Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Files Community

DevNumb committed on Dec 5, 2025

Commit

ca48ace

verified ·

1 Parent(s): 46bfe65

Update app.py

Browse files

Files changed (1) hide show

app.py +427 -157

app.py CHANGED Viewed

@@ -1,18 +1,24 @@
 import gradio as gr
 import torch
 import numpy as np
-import scipy.io.wavfile
 import tempfile
 import time
-from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
 import warnings
 warnings.filterwarnings("ignore")
 # Custom CSS for beautiful UI
 custom_css = """
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
 }
 .header {
@@ -22,6 +28,25 @@ custom_css = """
     border-radius: 20px;
     margin-bottom: 2rem;
     color: white;
 }
 .header h1 {
@@ -31,6 +56,15 @@ custom_css = """
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-weight: 800;
 }
 .glass-card {
@@ -39,6 +73,12 @@ custom_css = """
     border: 1px solid rgba(255, 255, 255, 0.2) !important;
     border-radius: 20px !important;
     padding: 1.5rem !important;
 }
 .glow-button {
@@ -49,6 +89,8 @@ custom_css = """
     border-radius: 50px !important;
     font-weight: 600 !important;
     transition: all 0.3s ease !important;
 }
 .glow-button:hover {
@@ -56,6 +98,21 @@ custom_css = """
     box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 }
 .fancy-textbox textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
@@ -63,6 +120,13 @@ custom_css = """
     color: white !important;
     padding: 1rem !important;
     font-size: 1.1em !important;
 }
 .stats-card {
@@ -70,6 +134,11 @@ custom_css = """
     padding: 1rem !important;
     border-radius: 15px !important;
     text-align: center !important;
 }
 .stats-value {
@@ -78,19 +147,29 @@ custom_css = """
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     -webkit-background-clip: text !important;
     -webkit-text-fill-color: transparent !important;
 }
 .stats-label {
     color: rgba(255, 255, 255, 0.7) !important;
     font-size: 0.8em !important;
     text-transform: uppercase !important;
 }
-.tab-button {
     border-radius: 10px !important;
 }
-.tab-button.selected {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     color: white !important;
 }
@@ -98,28 +177,111 @@ custom_css = """
 .audio-player {
     background: rgba(255, 255, 255, 0.05) !important;
     border-radius: 15px !important;
-    padding: 1rem !important;
 }
 """
-# Initialize model and processor
 @gr.cache_resource
 def load_model():
-    print("Loading VibeVoice model...")
     try:
-        model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-            "microsoft/VibeVoice-Realtime-0.5B",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto"
-        )
-        processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
-        print("Model loaded successfully!")
-        return model, processor
     except Exception as e:
-        print(f"Error loading model: {e}")
-        return None, None
-model, processor = load_model()
 # Stats tracking
 class TTSStats:
@@ -145,114 +307,160 @@ class TTSStats:
 stats = TTSStats()
-def generate_speech(text, speed=1.0, temperature=0.7):
-    """Generate speech from text"""
     try:
         if not text or text.strip() == "":
-            return None, "Please enter some text"
-        if len(text) > 500:
-            text = text[:500]
         # Update stats
         stats.add_generation(text)
-        # Process input
-        inputs = processor(
-            text=text,
-            return_tensors="pt",
-            sampling_rate=16000,
-        )
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate audio
-        with torch.no_grad():
-            audio = model.generate(
-                **inputs,
-                temperature=temperature,
-                do_sample=True,
-            )
-        # Convert to numpy
-        audio_np = audio.cpu().numpy().squeeze()
         # Apply speed adjustment
         if speed != 1.0:
             from scipy import signal
-            new_length = int(len(audio_np) / speed)
-            audio_np = signal.resample(audio_np, new_length)
-        # Normalize audio
-        max_val = np.max(np.abs(audio_np))
-        if max_val > 0:
-            audio_np = audio_np / max_val * 0.95
-        # Create temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
-            return tmp_file.name, f"✅ Generated {len(text)} characters"
     except Exception as e:
-        print(f"Error: {e}")
-        return None, f"❌ Error: {str(e)}"
-def update_stats():
-    """Update statistics display"""
     stats_data = stats.get_stats()
     return f"""
-    <div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;'>
-        <div class='stats-card'>
-            <div class='stats-value'>{stats_data['total_generations']}</div>
-            <div class='stats-label'>Generations</div>
         </div>
-        <div class='stats-card'>
-            <div class='stats-value'>{stats_data['total_chars']}</div>
-            <div class='stats-label'>Characters</div>
         </div>
-        <div class='stats-card'>
-            <div class='stats-value'>{stats_data['avg_chars']:.0f}</div>
-            <div class='stats-label'>Avg Length</div>
         </div>
-        <div class='stats-card'>
-            <div class='stats-value'>{stats_data['uptime']}</div>
-            <div class='stats-label'>Uptime</div>
         </div>
     </div>
     """
 # Create the interface
 with gr.Blocks(
-    title="VibeVoice TTS",
     theme=gr.themes.Soft(
         primary_hue="violet",
-        secondary_hue="purple"
     ),
     css=custom_css
 ) as demo:
-    # Header
     with gr.Column(elem_classes="header"):
-        gr.Markdown("""
-        # 🎵 VibeVoice Text-to-Speech
-        ### Transform text into natural, expressive speech
         """)
-    # Main content
     with gr.Row():
-        # Left panel - Input
         with gr.Column(scale=1, elem_classes="glass-card"):
-            gr.Markdown("### 📝 Input Text")
             text_input = gr.Textbox(
                 label="",
-                placeholder="Enter your text here...",
-                lines=5,
                 elem_classes="fancy-textbox"
             )
-            gr.Markdown("### ⚙️ Settings")
             with gr.Row():
                 speed = gr.Slider(
@@ -260,133 +468,195 @@ with gr.Blocks(
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
-                    label="Speaking Speed"
-                )
-                temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.5,
-                    value=0.7,
-                    step=0.1,
-                    label="Temperature"
                 )
             with gr.Row():
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
-                    elem_classes="glow-button"
                 )
-                clear_btn = gr.Button("Clear")
-        # Right panel - Output
         with gr.Column(scale=1, elem_classes="glass-card"):
-            gr.Markdown("### 🎧 Output")
             with gr.Column(elem_classes="audio-player"):
-                audio_output = gr.Audio(label="", type="filepath")
-                status = gr.Markdown("Ready to generate...")
-            # Quick actions
             with gr.Row():
-                download_btn = gr.Button("💾 Download")
-                test_btn = gr.Button("🎯 Test")
-    # Stats and examples
-    with gr.Tabs():
-        with gr.TabItem("📈 Statistics"):
-            stats_display = gr.HTML()
-            refresh_btn = gr.Button("🔄 Refresh")
-        with gr.TabItem("💡 Examples"):
-            gr.Examples(
-                examples=[
-                    ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
-                    ["The quick brown fox jumps over the lazy dog."],
-                    ["Artificial intelligence is transforming our world."],
-                    ["This is a test of the text to speech system."],
-                ],
-                inputs=text_input,
-                label="Click to load example"
-            )
-        with gr.TabItem("ℹ️ About"):
-            gr.Markdown("""
-            ## About VibeVoice
-            **VibeVoice** is Microsoft's state-of-the-art text-to-speech model.
-            ### Features:
-            - Real-time speech generation
-            - Natural sounding voices
-            - Adjustable parameters
-            ### Tips:
-            - Keep text under 500 characters
-            - Adjust speed for different effects
-            - Temperature controls voice variation
-            ### Model Info:
-            - Model: VibeVoice-Realtime-0.5B
-            - Parameters: 0.5 billion
-            - Audio: 16kHz, 32-bit
-            """)
     # Footer
-    gr.Markdown("---")
-    gr.Markdown("""
-    <div style='text-align: center; color: rgba(255,255,255,0.5);'>
-    Made with ❤️ using Gradio & Transformers | VibeVoice TTS
     </div>
     """)
-    # Event handlers
-    def process_text(text, speed_val, temp_val):
-        if not text:
-            return None, "Please enter text"
-        audio, msg = generate_speech(text, speed_val, temp_val)
-        stats_html = update_stats()
-        return audio, msg, stats_html
     def clear_all():
-        return "", None, "Cleared", update_stats()
     def test_voice():
-        test_text = "This is a test of the VibeVoice text-to-speech system. Hello world!"
         return test_text
     # Connect buttons
     generate_btn.click(
-        fn=process_text,
-        inputs=[text_input, speed, temperature],
-        outputs=[audio_output, status, stats_display]
     )
     clear_btn.click(
         fn=clear_all,
         inputs=[],
-        outputs=[text_input, audio_output, status, stats_display]
     )
-    test_btn.click(
         fn=test_voice,
         inputs=[],
         outputs=[text_input]
     )
-    refresh_btn.click(
-        fn=update_stats,
         inputs=[],
         outputs=[stats_display]
     )
-    # Initialize stats
     demo.load(
-        fn=update_stats,
         inputs=[],
         outputs=[stats_display]
     )
 if __name__ == "__main__":
-    demo.launch(debug=True)

 import gradio as gr
 import torch
 import numpy as np
 import tempfile
 import time
 import warnings
 warnings.filterwarnings("ignore")
+# Try to import the pipeline
+try:
+    from transformers import pipeline
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
 # Custom CSS for beautiful UI
 custom_css = """
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 }
 .header {
     border-radius: 20px;
     margin-bottom: 2rem;
     color: white;
+    position: relative;
+    overflow: hidden;
+}
+.header::before {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
+    animation: shimmer 3s infinite linear;
+    background-size: 200% auto;
+}
+@keyframes shimmer {
+    0% { background-position: -200% center; }
+    100% { background-position: 200% center; }
 }
 .header h1 {
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-weight: 800;
+    position: relative;
+    z-index: 1;
+}
+.header p {
+    font-size: 1.2em;
+    opacity: 0.9;
+    position: relative;
+    z-index: 1;
 }
 .glass-card {
     border: 1px solid rgba(255, 255, 255, 0.2) !important;
     border-radius: 20px !important;
     padding: 1.5rem !important;
+    transition: all 0.3s ease !important;
+}
+.glass-card:hover {
+    transform: translateY(-5px) !important;
+    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
 }
 .glow-button {
     border-radius: 50px !important;
     font-weight: 600 !important;
     transition: all 0.3s ease !important;
+    position: relative !important;
+    overflow: hidden !important;
 }
 .glow-button:hover {
     box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 }
+.glow-button::after {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: -100%;
+    width: 100%;
+    height: 100%;
+    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
+    transition: 0.5s;
+}
+.glow-button:hover::after {
+    left: 100%;
+}
 .fancy-textbox textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
     color: white !important;
     padding: 1rem !important;
     font-size: 1.1em !important;
+    transition: all 0.3s ease !important;
+}
+.fancy-textbox textarea:focus {
+    border-color: #667eea !important;
+    box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
+    background: rgba(255, 255, 255, 0.08) !important;
 }
 .stats-card {
     padding: 1rem !important;
     border-radius: 15px !important;
     text-align: center !important;
+    transition: transform 0.3s ease !important;
+}
+.stats-card:hover {
+    transform: scale(1.05) !important;
 }
 .stats-value {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     -webkit-background-clip: text !important;
     -webkit-text-fill-color: transparent !important;
+    margin-bottom: 0.5rem !important;
 }
 .stats-label {
     color: rgba(255, 255, 255, 0.7) !important;
     font-size: 0.8em !important;
     text-transform: uppercase !important;
+    letter-spacing: 1px !important;
+}
+.tab-nav {
+    background: rgba(255, 255, 255, 0.05) !important;
+    border-radius: 15px !important;
+    padding: 0.5rem !important;
 }
+.tab-nav button {
     border-radius: 10px !important;
+    margin: 0 0.25rem !important;
+    transition: all 0.3s ease !important;
 }
+.tab-nav button.selected {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     color: white !important;
 }
 .audio-player {
     background: rgba(255, 255, 255, 0.05) !important;
     border-radius: 15px !important;
+    padding: 1.5rem !important;
+    border: 2px solid rgba(255, 255, 255, 0.1) !important;
+}
+.progress-container {
+    margin: 1rem 0;
+}
+.progress-bar {
+    height: 6px;
+    background: rgba(255, 255, 255, 0.1);
+    border-radius: 10px;
+    overflow: hidden;
+    position: relative;
+}
+.progress-fill {
+    height: 100%;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    width: 0%;
+    border-radius: 10px;
+    transition: width 0.3s ease;
+    position: relative;
+}
+.progress-fill::after {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
+    animation: shimmer 2s infinite;
+}
+/* Custom slider */
+.custom-slider .gr-slider {
+    background: rgba(255, 255, 255, 0.1) !important;
+    height: 8px !important;
+    border-radius: 10px !important;
+}
+.custom-slider .gr-slider::-webkit-slider-thumb {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    border: none !important;
+    width: 24px !important;
+    height: 24px !important;
+    border-radius: 50% !important;
+    box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
+    cursor: pointer !important;
+}
+.secondary-button {
+    background: rgba(255, 255, 255, 0.1) !important;
+    border: 2px solid rgba(255, 255, 255, 0.3) !important;
+    color: white !important;
+    padding: 0.6rem 1.2rem !important;
+    border-radius: 50px !important;
+    transition: all 0.3s ease !important;
+}
+.secondary-button:hover {
+    background: rgba(255, 255, 255, 0.2) !important;
+    border-color: rgba(255, 255, 255, 0.5) !important;
+    transform: translateY(-2px) !important;
 }
 """
+# Initialize model
 @gr.cache_resource
 def load_model():
+    print("🚀 Loading VibeVoice model...")
     try:
+        if HAS_TRANSFORMERS:
+            # Use the pipeline API which is more stable
+            pipe = pipeline(
+                "text-to-speech",
+                model="microsoft/VibeVoice-Realtime-0.5B",
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device=0 if torch.cuda.is_available() else -1
+            )
+            print("✅ Model loaded successfully using pipeline!")
+            return pipe
+        else:
+            print("❌ Transformers not available")
+            return None
     except Exception as e:
+        print(f"❌ Error loading model: {e}")
+        # Try alternative import
+        try:
+            from transformers import VitsModel, AutoTokenizer
+            print("⚠️ Trying alternative model loading...")
+            model = VitsModel.from_pretrained(
+                "microsoft/VibeVoice-Realtime-0.5B",
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+            )
+            tokenizer = AutoTokenizer.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+            return {"model": model, "tokenizer": tokenizer}
+        except Exception as e2:
+            print(f"❌ Alternative loading also failed: {e2}")
+            return None
+# Initialize model
+model_pipe = load_model()
 # Stats tracking
 class TTSStats:
 stats = TTSStats()
+def generate_speech(text, speed=1.0, emotion="neutral"):
+    """Generate speech from text using the pipeline"""
     try:
         if not text or text.strip() == "":
+            return None, "Please enter some text to convert to speech."
+        if len(text) > 1000:
+            text = text[:1000]
+            gr.Warning("Text truncated to 1000 characters for better performance.")
         # Update stats
         stats.add_generation(text)
+        if model_pipe is None:
+            return None, "Model not loaded. Please check the logs."
+        # Generate speech
+        print(f"Generating speech for: {text[:50]}...")
+        if isinstance(model_pipe, dict):
+            # Alternative model loading
+            from scipy.io.wavfile import write
+            import io
+            inputs = model_pipe["tokenizer"](text, return_tensors="pt")
+            with torch.no_grad():
+                output = model_pipe["model"](**inputs)
+            audio = output.waveform.squeeze().cpu().numpy()
+            sampling_rate = model_pipe["model"].config.sampling_rate
+        else:
+            # Pipeline API
+            result = model_pipe(text)
+            audio = result["audio"]
+            sampling_rate = result["sampling_rate"]
+        # Normalize audio
+        audio = audio / np.max(np.abs(audio)) * 0.95
         # Apply speed adjustment
         if speed != 1.0:
             from scipy import signal
+            new_length = int(len(audio) / speed)
+            audio = signal.resample(audio, new_length)
+        # Save to temporary file
+        import scipy.io.wavfile
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
+            message = f"""
+            <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
+                <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ Generation Complete!</div>
+                <div style='color: rgba(255,255,255,0.8);'>
+                    Generated <strong>{len(text)}</strong> characters<br>
+                    Emotion: <strong>{emotion}</strong> | Speed: <strong>{speed}x</strong><br>
+                    Duration: <strong>{len(audio)/sampling_rate:.1f}s</strong>
+                </div>
+            </div>
+            """
+            return tmp_file.name, message
     except Exception as e:
+        print(f"Error generating speech: {e}")
+        # Create a simple fallback audio
+        try:
+            import scipy.io.wavfile
+            silent_audio = np.zeros(16000, dtype=np.float32)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
+                return tmp_file.name, f"❌ Error: {str(e)}. Generated silent audio as fallback."
+        except:
+            return None, f"❌ Error: {str(e)}"
+def update_stats_display():
+    """Update the statistics display"""
     stats_data = stats.get_stats()
     return f"""
+    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
+        <div class="stats-card">
+            <div class="stats-value">{stats_data['total_generations']}</div>
+            <div class="stats-label">Total Generations</div>
         </div>
+        <div class="stats-card">
+            <div class="stats-value">{stats_data['total_chars']}</div>
+            <div class="stats-label">Characters Processed</div>
         </div>
+        <div class="stats-card">
+            <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
+            <div class="stats-label">Avg. Characters</div>
         </div>
+        <div class="stats-card">
+            <div class="stats-value">{stats_data['uptime']}</div>
+            <div class="stats-label">System Uptime</div>
         </div>
     </div>
     """
 # Create the interface
 with gr.Blocks(
+    title="🎵 VibeVoice Pro - AI Text to Speech",
     theme=gr.themes.Soft(
         primary_hue="violet",
+        secondary_hue="purple",
+        neutral_hue="slate"
     ),
     css=custom_css
 ) as demo:
+    # Header Section
     with gr.Column(elem_classes="header"):
+        gr.HTML("""
+        <div style="text-align: center;">
+            <h1>🎵 VibeVoice Pro</h1>
+            <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Natural, Expressive Speech</p>
+            <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
+                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🤖 Powered by Microsoft VibeVoice
+                </span>
+                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    ⚡ Real-time Generation
+                </span>
+                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🎭 Multiple Emotions
+                </span>
+            </div>
+        </div>
         """)
+    # Main Content
     with gr.Row():
+        # Left Panel - Input Controls
         with gr.Column(scale=1, elem_classes="glass-card"):
+            gr.Markdown("### 📝 Text Input")
             text_input = gr.Textbox(
                 label="",
+                placeholder="✨ Enter your text here... (Maximum 1000 characters)",
+                lines=6,
+                max_lines=10,
                 elem_classes="fancy-textbox"
             )
+            gr.Markdown("### 🎭 Voice Settings")
+            with gr.Row():
+                emotion = gr.Dropdown(
+                    label="Voice Emotion",
+                    choices=["neutral", "happy", "excited", "calm", "professional"],
+                    value="neutral",
+                    info="Select the emotional tone"
+                )
             with gr.Row():
                 speed = gr.Slider(
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
+                    label="🎚️ Speaking Speed",
+                    info="Adjust the speaking rate",
+                    elem_classes="custom-slider"
                 )
+            # Action Buttons
             with gr.Row():
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
+                    size="lg",
+                    elem_classes="glow-button",
+                    scale=2
                 )
+                clear_btn = gr.Button(
+                    "🗑️ Clear All",
+                    variant="secondary",
+                    elem_classes="secondary-button"
+                )
+            # Quick Actions
+            gr.Markdown("### ⚡ Quick Actions")
+            with gr.Row():
+                quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
+                quick_clear = gr.Button("📄 Clear Text", size="sm", elem_classes="secondary-button")
+        # Right Panel - Output Display
         with gr.Column(scale=1, elem_classes="glass-card"):
+            gr.Markdown("### 🎧 Generated Audio")
             with gr.Column(elem_classes="audio-player"):
+                audio_output = gr.Audio(
+                    label="",
+                    type="filepath",
+                    elem_id="audio_output"
+                )
+                # Status and Info
+                status_display = gr.HTML(
+                    value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
+                )
+            # Download and Share
             with gr.Row():
+                download_btn = gr.Button("💾 Download Audio", elem_classes="secondary-button")
+                copy_btn = gr.Button("📋 Copy Text", elem_classes="secondary-button")
+    # Bottom Section - Stats and Examples
+    with gr.Column(elem_classes="glass-card"):
+        with gr.Tabs(elem_classes="tab-nav"):
+            with gr.TabItem("📈 Statistics"):
+                stats_display = gr.HTML(
+                    value=update_stats_display()
+                )
+                refresh_stats = gr.Button("🔄 Refresh Stats", size="sm", elem_classes="secondary-button")
+            with gr.TabItem("💡 Examples"):
+                gr.Examples(
+                    examples=[
+                        ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro."],
+                        ["In a world where AI transforms everything, voice synthesis stands at the forefront."],
+                        ["The quick brown fox jumps over the lazy dog. This tests all English phonemes."],
+                        ["Imagine a world where every written word can be heard in beautiful, human-like voice."],
+                        ["This is not just text-to-speech. This is emotion and expression in every syllable."]
+                    ],
+                    inputs=text_input,
+                    label="Click any example to try it",
+                    examples_per_page=5
+                )
+            with gr.TabItem("⚙️ Settings & Info"):
+                gr.Markdown("### About VibeVoice Pro")
+                gr.Markdown("""
+                **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
+                ### Features:
+                - 🎵 **High Quality**: Professional-grade speech synthesis
+                - ⚡ **Real-time**: Fast generation with GPU acceleration
+                - 🎭 **Emotional Control**: Multiple voice emotions
+                - 🎚️ **Customizable**: Adjustable speed and parameters
+                ### Technical Info:
+                - **Model**: VibeVoice-Realtime-0.5B
+                - **Max Input**: 1000 characters
+                - **Audio Quality**: 16kHz, 32-bit float
+                - **Languages**: English (optimized)
+                ⚠️ **Note**: For best results, keep text under 500 characters.
+                """)
     # Footer
+    gr.HTML("""
+    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
+        <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
+            <span style="color: rgba(255,255,255,0.7);">📖 Powered by Transformers</span>
+            <span style="color: rgba(255,255,255,0.7);">🎵 Microsoft VibeVoice</span>
+            <span style="color: rgba(255,255,255,0.7);">✨ Gradio Interface</span>
+        </div>
+        <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
+            Made with ❤️ |
+            <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
+        </p>
     </div>
+    <script>
+        function updateTime() {
+            const now = new Date();
+            const timeString = now.toLocaleTimeString();
+            document.getElementById('live-time').textContent = timeString;
+        }
+        setInterval(updateTime, 1000);
+        updateTime();
+    </script>
     """)
+    # Event Handlers
+    def process_generation(text, emotion_val, speed_val):
+        """Handle speech generation"""
+        if not text or text.strip() == "":
+            return None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>", update_stats_display()
+        # Show processing message
+        yield None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>", update_stats_display()
+        # Generate speech
+        audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
+        # Update stats
+        stats_html = update_stats_display()
+        return audio_path, status_msg, stats_html
     def clear_all():
+        return "", None, "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>", update_stats_display()
     def test_voice():
+        test_text = "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this technology?"
         return test_text
+    def copy_text():
+        return gr.Info("Text copied to clipboard!")
     # Connect buttons
     generate_btn.click(
+        fn=process_generation,
+        inputs=[text_input, emotion, speed],
+        outputs=[audio_output, status_display, stats_display]
     )
     clear_btn.click(
         fn=clear_all,
         inputs=[],
+        outputs=[text_input, audio_output, status_display, stats_display]
     )
+    quick_test.click(
         fn=test_voice,
         inputs=[],
         outputs=[text_input]
     )
+    quick_clear.click(
+        fn=lambda: "",
+        inputs=[],
+        outputs=[text_input]
+    )
+    refresh_stats.click(
+        fn=update_stats_display,
         inputs=[],
         outputs=[stats_display]
     )
+    copy_btn.click(
+        fn=copy_text,
+        inputs=[],
+        outputs=[]
+    )
+    # Initialize
     demo.load(
+        fn=lambda: (update_stats_display(), gr.Info("VibeVoice Pro is ready! Enter text and click Generate Speech.")),
         inputs=[],
         outputs=[stats_display]
     )
 if __name__ == "__main__":
+    demo.launch(
+        debug=True,
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7860
+    )