Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Files Community

DevNumb committed on Dec 5, 2025

Commit

bc55770

verified ·

1 Parent(s): 6da43ce

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -546

app.py CHANGED Viewed

@@ -3,128 +3,52 @@ import torch
 import numpy as np
 import scipy.io.wavfile
 import tempfile
-import os
 import time
-import plotly.graph_objects as go
-from datetime import datetime
-from PIL import Image
-import io
-import base64
 from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
 import warnings
 warnings.filterwarnings("ignore")
 # Custom CSS for beautiful UI
 custom_css = """
-/* Main Theme Variables */
-:root {
-    --primary-gradient: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    --secondary-gradient: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
-    --accent-color: #8a2be2;
-    --dark-bg: #0f172a;
-    --card-bg: rgba(255, 255, 255, 0.1);
-    --glass-effect: backdrop-filter: blur(10px);
 }
-/* Custom Scrollbar */
-::-webkit-scrollbar {
-    width: 10px;
-}
-::-webkit-scrollbar-track {
-    background: rgba(255, 255, 255, 0.1);
-    border-radius: 10px;
-}
-::-webkit-scrollbar-thumb {
-    background: var(--primary-gradient);
-    border-radius: 10px;
-}
-/* Header Animation */
-@keyframes float {
-    0%, 100% { transform: translateY(0px); }
-    50% { transform: translateY(-10px); }
-}
-@keyframes pulse-glow {
-    0%, 100% { box-shadow: 0 0 20px rgba(102, 126, 234, 0.5); }
-    50% { box-shadow: 0 0 40px rgba(102, 126, 234, 0.8); }
-}
-@keyframes shimmer {
-    0% { background-position: -200% center; }
-    100% { background-position: 200% center; }
-}
-/* Header Styles */
-.header-container {
     text-align: center;
     padding: 2rem;
-    background: var(--primary-gradient);
     border-radius: 20px;
     margin-bottom: 2rem;
-    position: relative;
-    overflow: hidden;
-}
-.header-container::before {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
-    animation: shimmer 3s infinite linear;
-    background-size: 200% auto;
 }
-.header-title {
-    font-size: 3.5em !important;
     background: linear-gradient(45deg, #fff, #f0f0f0);
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
-    margin-bottom: 0.5rem !important;
-    font-weight: 800 !important;
-    text-shadow: 0 2px 10px rgba(0,0,0,0.2);
-    animation: float 3s ease-in-out infinite;
-}
-.header-subtitle {
-    font-size: 1.2em !important;
-    color: rgba(255, 255, 255, 0.9) !important;
-    margin-bottom: 1rem !important;
 }
-/* Card Styles */
 .glass-card {
     background: rgba(255, 255, 255, 0.1) !important;
     backdrop-filter: blur(10px) !important;
     border: 1px solid rgba(255, 255, 255, 0.2) !important;
     border-radius: 20px !important;
-    padding: 2rem !important;
-    transition: all 0.3s ease !important;
-}
-.glass-card:hover {
-    transform: translateY(-5px) !important;
-    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
 }
-/* Button Styles */
 .glow-button {
-    background: var(--primary-gradient) !important;
     border: none !important;
     color: white !important;
-    padding: 1rem 2rem !important;
     border-radius: 50px !important;
-    font-size: 1.1em !important;
     font-weight: 600 !important;
     transition: all 0.3s ease !important;
-    position: relative !important;
-    overflow: hidden !important;
-    animation: pulse-glow 2s infinite !important;
 }
 .glow-button:hover {
@@ -132,189 +56,68 @@ custom_css = """
     box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 }
-.glow-button::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: -100%;
-    width: 100%;
-    height: 100%;
-    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
-    transition: 0.5s;
-}
-.glow-button:hover::after {
-    left: 100%;
-}
-.secondary-button {
-    background: rgba(255, 255, 255, 0.1) !important;
-    border: 2px solid rgba(255, 255, 255, 0.3) !important;
-    color: white !important;
-    padding: 0.8rem 1.5rem !important;
-    border-radius: 50px !important;
-    font-size: 1em !important;
-    transition: all 0.3s ease !important;
-}
-.secondary-button:hover {
-    background: rgba(255, 255, 255, 0.2) !important;
-    border-color: rgba(255, 255, 255, 0.5) !important;
-    transform: translateY(-2px) !important;
-}
-/* Input Styles */
 .fancy-textbox textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
     border-radius: 15px !important;
     color: white !important;
     font-size: 1.1em !important;
-    padding: 1.5rem !important;
-    transition: all 0.3s ease !important;
 }
-.fancy-textbox textarea:focus {
-    border-color: #667eea !important;
-    box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
-    background: rgba(255, 255, 255, 0.08) !important;
-}
-/* Slider Styles */
-.custom-slider .gr-slider {
-    background: rgba(255, 255, 255, 0.1) !important;
-    height: 8px !important;
-    border-radius: 10px !important;
-}
-.custom-slider .gr-slider::-webkit-slider-thumb {
-    background: var(--primary-gradient) !important;
-    border: none !important;
-    width: 24px !important;
-    height: 24px !important;
-    border-radius: 50% !important;
-    box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
-}
-/* Audio Player Styles */
-.audio-container {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border-radius: 20px !important;
-    padding: 2rem !important;
-    border: 2px solid rgba(255, 255, 255, 0.1) !important;
-}
-/* Stats Card */
 .stats-card {
     background: rgba(255, 255, 255, 0.08) !important;
-    padding: 1.5rem !important;
     border-radius: 15px !important;
     text-align: center !important;
-    transition: transform 0.3s ease !important;
-}
-.stats-card:hover {
-    transform: scale(1.05) !important;
 }
 .stats-value {
-    font-size: 2.5em !important;
     font-weight: 700 !important;
-    background: var(--primary-gradient) !important;
     -webkit-background-clip: text !important;
     -webkit-text-fill-color: transparent !important;
-    margin-bottom: 0.5rem !important;
 }
 .stats-label {
     color: rgba(255, 255, 255, 0.7) !important;
-    font-size: 0.9em !important;
     text-transform: uppercase !important;
-    letter-spacing: 1px !important;
-}
-/* Progress Bar */
-.progress-container {
-    margin: 2rem 0;
-}
-.progress-bar {
-    height: 8px;
-    background: rgba(255, 255, 255, 0.1);
-    border-radius: 10px;
-    overflow: hidden;
-    position: relative;
 }
-.progress-fill {
-    height: 100%;
-    background: var(--primary-gradient);
-    width: 0%;
-    border-radius: 10px;
-    transition: width 0.3s ease;
-    position: relative;
 }
-.progress-fill::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
-    animation: shimmer 2s infinite;
 }
-/* Tab Styles */
-.tab-nav {
     background: rgba(255, 255, 255, 0.05) !important;
     border-radius: 15px !important;
-    padding: 0.5rem !important;
-}
-.tab-nav button {
-    border-radius: 10px !important;
-    margin: 0 0.25rem !important;
-    transition: all 0.3s ease !important;
-}
-.tab-nav button.selected {
-    background: var(--primary-gradient) !important;
-}
-/* Notification */
-.notification {
-    position: fixed;
-    top: 20px;
-    right: 20px;
-    background: var(--primary-gradient);
-    color: white;
-    padding: 1rem 1.5rem;
-    border-radius: 10px;
-    box-shadow: 0 10px 30px rgba(0,0,0,0.3);
-    z-index: 1000;
-    animation: slideIn 0.3s ease;
-}
-@keyframes slideIn {
-    from { transform: translateX(100%); opacity: 0; }
-    to { transform: translateX(0); opacity: 1; }
 }
 """
 # Initialize model and processor
 @gr.cache_resource
 def load_model():
-    print("🚀 Loading VibeVoice model...")
-    model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-        "microsoft/VibeVoice-Realtime-0.5B",
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto"
-    )
-    processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
-    print("✅ Model loaded successfully!")
-    return model, processor
 model, processor = load_model()
@@ -333,106 +136,30 @@ class TTSStats:
         uptime = time.time() - self.start_time
         hours, remainder = divmod(uptime, 3600)
         minutes, seconds = divmod(remainder, 60)
         return {
             'total_generations': self.total_generations,
             'total_chars': self.total_chars,
             'avg_chars': self.total_chars / max(self.total_generations, 1),
-            'uptime': f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
         }
 stats = TTSStats()
-def create_waveform_visualization(audio_data, sr=16000):
-    """Create a beautiful waveform visualization"""
-    if audio_data is None:
-        return None
-    # Sample the audio data for visualization
-    samples = audio_data[::10]  # Downsample for performance
-    x = np.arange(len(samples)) / (sr / 10)
-    fig = go.Figure()
-    # Add waveform trace with gradient fill
-    fig.add_trace(go.Scatter(
-        x=x,
-        y=samples,
-        fill='tozeroy',
-        mode='lines',
-        line=dict(
-            color='#667eea',
-            width=2,
-            shape='spline'
-        ),
-        fillcolor='rgba(102, 126, 234, 0.3)',
-        name='Waveform'
-    ))
-    # Add envelope trace
-    envelope = np.abs(samples)
-    fig.add_trace(go.Scatter(
-        x=x,
-        y=envelope,
-        mode='lines',
-        line=dict(
-            color='#764ba2',
-            width=1,
-            dash='dash'
-        ),
-        name='Envelope'
-    ))
-    fig.update_layout(
-        title="🎵 Audio Waveform",
-        plot_bgcolor='rgba(255, 255, 255, 0.05)',
-        paper_bgcolor='rgba(0, 0, 0, 0)',
-        font=dict(color='white'),
-        xaxis=dict(
-            title="Time (s)",
-            gridcolor='rgba(255, 255, 255, 0.1)',
-            zerolinecolor='rgba(255, 255, 255, 0.2)'
-        ),
-        yaxis=dict(
-            title="Amplitude",
-            gridcolor='rgba(255, 255, 255, 0.1)',
-            zerolinecolor='rgba(255, 255, 255, 0.2)'
-        ),
-        showlegend=True,
-        legend=dict(
-            bgcolor='rgba(255, 255, 255, 0.1)',
-            bordercolor='rgba(255, 255, 255, 0.2)'
-        ),
-        margin=dict(l=50, r=50, t=50, b=50)
-    )
-    return fig
-def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
-    """
-    Generate speech from text with enhanced parameters
-    """
     try:
         if not text or text.strip() == "":
-            return None, None, "Please enter some text to convert to speech."
         # Update stats
         stats.add_generation(text)
-        # Add voice style prompt
-        style_prompts = {
-            "neutral": "",
-            "excited": "with excited and energetic voice",
-            "calm": "with calm and soothing voice",
-            "professional": "with professional and clear voice",
-            "storytelling": "with engaging storytelling voice"
-        }
-        prompt = f"{text} {style_prompts.get(voice_style, '')}".strip()
         # Process input
         inputs = processor(
-            text=prompt,
             return_tensors="pt",
             sampling_rate=16000,
         )
@@ -440,14 +167,12 @@ def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
         device = next(model.parameters()).device
         inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate with progress callback simulation
         with torch.no_grad():
             audio = model.generate(
                 **inputs,
                 temperature=temperature,
                 do_sample=True,
-                length_penalty=1.0,
-                repetition_penalty=2.0,
             )
         # Convert to numpy
@@ -467,95 +192,67 @@ def generate_speech(text, voice_style="neutral", speed=1.0, temperature=0.7):
         # Create temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
-            # Create waveform visualization
-            waveform_fig = create_waveform_visualization(audio_np)
-            return tmp_file.name, waveform_fig, "✅ Speech generated successfully!"
     except Exception as e:
         print(f"Error: {e}")
-        return None, None, f"❌ Error: {str(e)}"
-def update_stats_display():
-    """Update the statistics display"""
     stats_data = stats.get_stats()
     return f"""
-    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['total_generations']}</div>
-            <div class="stats-label">Total Generations</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['total_chars']}</div>
-            <div class="stats-label">Characters Processed</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
-            <div class="stats-label">Avg. Characters</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['uptime']}</div>
-            <div class="stats-label">System Uptime</div>
         </div>
     </div>
     """
-# Create the main interface
 with gr.Blocks(
-    title="🎵 VibeVoice Pro - AI Text to Speech",
     theme=gr.themes.Soft(
         primary_hue="violet",
-        secondary_hue="purple",
-        neutral_hue="slate"
     ),
     css=custom_css
 ) as demo:
-    # Header Section
-    with gr.Column(elem_classes="header-container"):
-        gr.HTML("""
-        <div style="text-align: center;">
-            <h1 class="header-title">🎵 VibeVoice Pro</h1>
-            <p class="header-subtitle">Transform Text into Natural, Expressive Speech</p>
-            <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem;">
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🤖 Powered by Microsoft VibeVoice
-                </span>
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    ⚡ Real-time Generation
-                </span>
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🎭 Multiple Voice Styles
-                </span>
-            </div>
-        </div>
         """)
-    # Main Content
     with gr.Row():
-        # Left Panel - Input Controls
         with gr.Column(scale=1, elem_classes="glass-card"):
-            gr.Markdown("### 📝 Text Input")
             text_input = gr.Textbox(
                 label="",
-                placeholder="✨ Enter your text here... (Maximum 1000 characters)",
-                lines=6,
-                max_lines=10,
-                elem_classes="fancy-textbox",
-                scale=2
             )
-            gr.Markdown("### 🎭 Voice Settings")
-            with gr.Row():
-                voice_style = gr.Dropdown(
-                    label="Voice Style",
-                    choices=["neutral", "excited", "calm", "professional", "storytelling"],
-                    value="neutral",
-                    info="Select the emotional tone of the voice"
-                )
             with gr.Row():
                 speed = gr.Slider(
@@ -563,9 +260,7 @@ with gr.Blocks(
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
-                    label="🎚️ Speaking Speed",
-                    info="Adjust the speaking rate",
-                    elem_classes="custom-slider"
                 )
                 temperature = gr.Slider(
@@ -573,207 +268,125 @@ with gr.Blocks(
                     maximum=1.5,
                     value=0.7,
                     step=0.1,
-                    label="🔥 Temperature",
-                    info="Control creativity vs consistency",
-                    elem_classes="custom-slider"
                 )
-            # Action Buttons
             with gr.Row():
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
-                    size="lg",
-                    elem_classes="glow-button",
-                    scale=2
                 )
-                clear_btn = gr.Button(
-                    "🗑️ Clear All",
-                    variant="secondary",
-                    elem_classes="secondary-button"
-                )
-            # Quick Actions
-            gr.Markdown("### ⚡ Quick Actions")
-            with gr.Row():
-                quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
-                quick_clear = gr.Button("📄 Clear Text", size="sm", elem_classes="secondary-button")
-        # Right Panel - Output Display
         with gr.Column(scale=1, elem_classes="glass-card"):
-            gr.Markdown("### 🎧 Generated Audio")
-            with gr.Column(elem_classes="audio-container"):
-                audio_output = gr.Audio(
-                    label="",
-                    type="filepath",
-                    elem_id="audio_output",
-                    scale=1
-                )
-                # Visualizer
-                waveform_plot = gr.Plot(
-                    label="📊 Audio Waveform",
-                    show_label=True
-                )
-                # Status and Info
-                status_display = gr.HTML(
-                    value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
-                )
-            # Download and Share
             with gr.Row():
-                download_btn = gr.Button("💾 Download Audio", elem_classes="secondary-button")
-                share_btn = gr.Button("🔗 Generate Share Link", elem_classes="secondary-button")
-    # Bottom Section - Stats and Examples
-    with gr.Column(elem_classes="glass-card"):
-        with gr.Tabs(elem_classes="tab-nav"):
-            with gr.TabItem("📈 Statistics"):
-                stats_display = gr.HTML(
-                    value=update_stats_display()
-                )
-                refresh_stats = gr.Button("🔄 Refresh Stats", size="sm")
-            with gr.TabItem("💡 Examples"):
-                gr.Examples(
-                    examples=[
-                        ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro, creating natural and expressive voices."],
-                        ["In a world where AI transforms everything, voice synthesis stands at the forefront of innovation and creativity."],
-                        ["The quick brown fox jumps over the lazy dog. This classic sentence tests all English phonemes."],
-                        ["Imagine a world where every written word can be heard in the most beautiful, human-like voice possible."],
-                        ["This is not just text-to-speech. This is emotion, expression, and personality in every syllable."]
-                    ],
-                    inputs=text_input,
-                    label="Click any example to try it",
-                    examples_per_page=5
-                )
-            with gr.TabItem("⚙️ Settings"):
-                gr.Markdown("### Advanced Settings")
-                with gr.Row():
-                    auto_play = gr.Checkbox(label="Auto-play generated audio", value=True)
-                    show_waveform = gr.Checkbox(label="Show waveform visualization", value=True)
-                    save_history = gr.Checkbox(label="Save generation history", value=False)
-                gr.Markdown("### About")
-                gr.Markdown("""
-                **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
-                - **Model**: VibeVoice-Realtime-0.5B
-                - **Max Input**: 1000 characters
-                - **Output Quality**: 16kHz, 32-bit float
-                - **Languages**: English (optimized)
-                ⚠️ **Note**: For best results, keep text under 500 characters.
-                """)
     # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
-        <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem;">
-            <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">📖 Documentation</a>
-            <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">🐛 Report Issue</a>
-            <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">⭐ Star Project</a>
-            <a href="#" style="color: rgba(255,255,255,0.7); text-decoration: none; transition: color 0.3s;">🔄 API Access</a>
-        </div>
-        <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
-            Made with ❤️ using Gradio & Transformers |
-            <span id="live-time" style="color: #667eea;"></span>
-        </p>
     </div>
-    <script>
-        function updateTime() {
-            const now = new Date();
-            const timeString = now.toLocaleTimeString();
-            document.getElementById('live-time').textContent = timeString;
-        }
-        setInterval(updateTime, 1000);
-        updateTime();
-        // Add smooth scroll behavior
-        document.addEventListener('DOMContentLoaded', function() {
-            document.querySelectorAll('a[href^="#"]').forEach(anchor => {
-                anchor.addEventListener('click', function (e) {
-                    e.preventDefault();
-                    const target = document.querySelector(this.getAttribute('href'));
-                    if (target) {
-                        target.scrollIntoView({ behavior: 'smooth' });
-                    }
-                });
-            });
-        });
-    </script>
     """)
-    # Event Handlers
-    def process_generation(text, voice_style, speed, temperature):
-        """Handle speech generation with visual feedback"""
-        if not text or text.strip() == "":
-            return None, None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>"
-        # Show processing message
-        yield None, None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>"
-        # Generate speech
-        audio_path, waveform, status = generate_speech(text, voice_style, speed, temperature)
-        # Update stats display
-        stats_html = update_stats_display()
-        return audio_path, waveform, f"""
-        <div style="background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;">
-            <div style="color: #667eea; font-weight: 600; margin-bottom: 0.5rem;">✅ Generation Complete!</div>
-            <div style="color: rgba(255,255,255,0.8);">
-                Generated {len(text)} characters | Voice: {voice_style.title()} | Speed: {speed}x
-            </div>
-        </div>
-        """
     # Connect buttons
     generate_btn.click(
-        fn=process_generation,
-        inputs=[text_input, voice_style, speed, temperature],
-        outputs=[audio_output, waveform_plot, status_display]
     )
     clear_btn.click(
-        fn=lambda: ["", None, None, 1.0, 0.7, "neutral", "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>"],
-        inputs=[],
-        outputs=[text_input, audio_output, waveform_plot, speed, temperature, voice_style, status_display]
-    )
-    quick_test.click(
-        fn=lambda: "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this?",
         inputs=[],
-        outputs=[text_input]
     )
-    quick_clear.click(
-        fn=lambda: "",
         inputs=[],
         outputs=[text_input]
     )
-    refresh_stats.click(
-        fn=update_stats_display,
         inputs=[],
         outputs=[stats_display]
     )
-    # Keyboard shortcuts
     demo.load(
-        fn=lambda: gr.Info("💡 Tip: Press Ctrl+Enter to generate speech faster!"),
         inputs=[],
-        outputs=[]
     )
 if __name__ == "__main__":
-    demo.launch(
-        debug=True,
-        share=False,
-        server_name="0.0.0.0",
-        server_port=7860,
-        favicon_path=None
-    )

 import numpy as np
 import scipy.io.wavfile
 import tempfile
 import time
 from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
 import warnings
 warnings.filterwarnings("ignore")
 # Custom CSS for beautiful UI
 custom_css = """
+.gradio-container {
+    max-width: 1200px !important;
+    margin: 0 auto !important;
 }
+.header {
     text-align: center;
     padding: 2rem;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
     border-radius: 20px;
     margin-bottom: 2rem;
+    color: white;
 }
+.header h1 {
+    font-size: 3em;
+    margin-bottom: 0.5rem;
     background: linear-gradient(45deg, #fff, #f0f0f0);
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
+    font-weight: 800;
 }
 .glass-card {
     background: rgba(255, 255, 255, 0.1) !important;
     backdrop-filter: blur(10px) !important;
     border: 1px solid rgba(255, 255, 255, 0.2) !important;
     border-radius: 20px !important;
+    padding: 1.5rem !important;
 }
 .glow-button {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     border: none !important;
     color: white !important;
+    padding: 0.8rem 1.5rem !important;
     border-radius: 50px !important;
     font-weight: 600 !important;
     transition: all 0.3s ease !important;
 }
 .glow-button:hover {
     box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 }
 .fancy-textbox textarea {
     background: rgba(255, 255, 255, 0.05) !important;
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
     border-radius: 15px !important;
     color: white !important;
+    padding: 1rem !important;
     font-size: 1.1em !important;
 }
 .stats-card {
     background: rgba(255, 255, 255, 0.08) !important;
+    padding: 1rem !important;
     border-radius: 15px !important;
     text-align: center !important;
 }
 .stats-value {
+    font-size: 2em !important;
     font-weight: 700 !important;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
     -webkit-background-clip: text !important;
     -webkit-text-fill-color: transparent !important;
 }
 .stats-label {
     color: rgba(255, 255, 255, 0.7) !important;
+    font-size: 0.8em !important;
     text-transform: uppercase !important;
 }
+.tab-button {
+    border-radius: 10px !important;
 }
+.tab-button.selected {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    color: white !important;
 }
+.audio-player {
     background: rgba(255, 255, 255, 0.05) !important;
     border-radius: 15px !important;
+    padding: 1rem !important;
 }
 """
 # Initialize model and processor
 # Load the VibeVoice model and its processor; the result is unpacked into the
 # module-level `model` and `processor` names right after this definition.
 # Returns a (model, processor) tuple, or (None, None) when loading raised.
 # NOTE(review): `gr.cache_resource` looks like Streamlit's `st.cache_resource`;
 # confirm the installed Gradio really exposes this attribute, because a missing
 # attribute would raise AttributeError as soon as the decorator is evaluated.
 @gr.cache_resource
 def load_model():
+    print("Loading VibeVoice model...")
+    try:
+        model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+            "microsoft/VibeVoice-Realtime-0.5B",
             # float16 is chosen only when CUDA is available, float32 otherwise.
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto"
+        )
+        processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+        print("Model loaded successfully!")
+        return model, processor
+    except Exception as e:
         # Failures are printed and swallowed, so the caller receives None for
         # both values instead of an exception.
         # NOTE(review): generate_speech calls processor(...) without a None
         # check, so a failed load only surfaces later as a generation error.
+        print(f"Error loading model: {e}")
+        return None, None
 model, processor = load_model()
         uptime = time.time() - self.start_time
         hours, remainder = divmod(uptime, 3600)
         minutes, seconds = divmod(remainder, 60)
         return {
             'total_generations': self.total_generations,
             'total_chars': self.total_chars,
             'avg_chars': self.total_chars / max(self.total_generations, 1),
+            'uptime': f"{int(hours)}h {int(minutes)}m"
         }
 stats = TTSStats()
+def generate_speech(text, speed=1.0, temperature=0.7):
+    """Generate speech from text"""
     try:
         if not text or text.strip() == "":
+            return None, "Please enter some text"
+        if len(text) > 500:
+            text = text[:500]
         # Update stats
         stats.add_generation(text)
         # Process input
         inputs = processor(
+            text=text,
             return_tensors="pt",
             sampling_rate=16000,
         )
         device = next(model.parameters()).device
         inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Generate audio
         with torch.no_grad():
             audio = model.generate(
                 **inputs,
                 temperature=temperature,
                 do_sample=True,
             )
         # Convert to numpy
         # Create temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, 16000, audio_np.astype(np.float32))
+            return tmp_file.name, f"✅ Generated {len(text)} characters"
     except Exception as e:
         print(f"Error: {e}")
+        return None, f"❌ Error: {str(e)}"
+def update_stats():
+    """Update statistics display"""
     # Returns an HTML string rather than updating anything in place: a
     # two-column CSS grid of four `stats-card` tiles showing the generation
     # count, the character count, the average length (formatted with no
     # decimals) and the uptime string.
     # The values are read from the dict returned by the module-level `stats`
     # object's get_stats().
     stats_data = stats.get_stats()
     return f"""
+    <div style='display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;'>
+        <div class='stats-card'>
+            <div class='stats-value'>{stats_data['total_generations']}</div>
+            <div class='stats-label'>Generations</div>
         </div>
+        <div class='stats-card'>
+            <div class='stats-value'>{stats_data['total_chars']}</div>
+            <div class='stats-label'>Characters</div>
         </div>
+        <div class='stats-card'>
+            <div class='stats-value'>{stats_data['avg_chars']:.0f}</div>
+            <div class='stats-label'>Avg Length</div>
         </div>
+        <div class='stats-card'>
+            <div class='stats-value'>{stats_data['uptime']}</div>
+            <div class='stats-label'>Uptime</div>
         </div>
     </div>
     """
+# Create the interface
 with gr.Blocks(
+    title="VibeVoice TTS",
     theme=gr.themes.Soft(
         primary_hue="violet",
+        secondary_hue="purple"
     ),
     css=custom_css
 ) as demo:
+    # Header
+    with gr.Column(elem_classes="header"):
+        gr.Markdown("""
+        # 🎵 VibeVoice Text-to-Speech
+        ### Transform text into natural, expressive speech
         """)
+    # Main content
     with gr.Row():
+        # Left panel - Input
         with gr.Column(scale=1, elem_classes="glass-card"):
+            gr.Markdown("### 📝 Input Text")
             text_input = gr.Textbox(
                 label="",
+                placeholder="Enter your text here...",
+                lines=5,
+                elem_classes="fancy-textbox"
             )
+            gr.Markdown("### ⚙️ Settings")
             with gr.Row():
                 speed = gr.Slider(
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
+                    label="Speaking Speed"
                 )
                 temperature = gr.Slider(
                     maximum=1.5,
                     value=0.7,
                     step=0.1,
+                    label="Temperature"
                 )
             with gr.Row():
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
+                    elem_classes="glow-button"
                 )
+                clear_btn = gr.Button("Clear")
+        # Right panel - Output
         with gr.Column(scale=1, elem_classes="glass-card"):
+            gr.Markdown("### 🎧 Output")
+            with gr.Column(elem_classes="audio-player"):
+                audio_output = gr.Audio(label="", type="filepath")
+                status = gr.Markdown("Ready to generate...")
+            # Quick actions
             with gr.Row():
+                download_btn = gr.Button("💾 Download")
+                test_btn = gr.Button("🎯 Test")
+    # Stats and examples
+    with gr.Tabs():
+        with gr.TabItem("📈 Statistics"):
+            stats_display = gr.HTML()
+            refresh_btn = gr.Button("🔄 Refresh")
+        with gr.TabItem("💡 Examples"):
+            gr.Examples(
+                examples=[
+                    ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
+                    ["The quick brown fox jumps over the lazy dog."],
+                    ["Artificial intelligence is transforming our world."],
+                    ["This is a test of the text to speech system."],
+                ],
+                inputs=text_input,
+                label="Click to load example"
+            )
+        with gr.TabItem("ℹ️ About"):
+            gr.Markdown("""
+            ## About VibeVoice
+            **VibeVoice** is Microsoft's state-of-the-art text-to-speech model.
+            ### Features:
+            - Real-time speech generation
+            - Natural sounding voices
+            - Adjustable parameters
+            ### Tips:
+            - Keep text under 500 characters
+            - Adjust speed for different effects
+            - Temperature controls voice variation
+            ### Model Info:
+            - Model: VibeVoice-Realtime-0.5B
+            - Parameters: 0.5 billion
+            - Audio: 16kHz, 32-bit
+            """)
     # Footer
+    gr.Markdown("---")
+    gr.Markdown("""
+    <div style='text-align: center; color: rgba(255,255,255,0.5);'>
+    Made with ❤️ using Gradio & Transformers | VibeVoice TTS
     </div>
     """)
+    # Event handlers
+    def process_text(text, speed_val, temp_val):
+        if not text:
+            return None, "Please enter text"
+        audio, msg = generate_speech(text, speed_val, temp_val)
+        stats_html = update_stats()
+        return audio, msg, stats_html
+    def clear_all():
         # Reset values for the Clear button, in the order of its wired outputs:
         # empty input text, no audio, the status text "Cleared", and freshly
         # rendered statistics HTML.
+        return "", None, "Cleared", update_stats()
+    def test_voice():
         # Returns a fixed sample sentence; the Test button writes it into the
         # input text box. This handler does not start generation itself.
+        test_text = "This is a test of the VibeVoice text-to-speech system. Hello world!"
+        return test_text
     # Connect buttons
     generate_btn.click(
+        fn=process_text,
+        inputs=[text_input, speed, temperature],
+        outputs=[audio_output, status, stats_display]
     )
     clear_btn.click(
+        fn=clear_all,
         inputs=[],
+        outputs=[text_input, audio_output, status, stats_display]
     )
+    test_btn.click(
+        fn=test_voice,
         inputs=[],
         outputs=[text_input]
     )
+    refresh_btn.click(
+        fn=update_stats,
         inputs=[],
         outputs=[stats_display]
     )
+    # Initialize stats
     demo.load(
+        fn=update_stats,
         inputs=[],
+        outputs=[stats_display]
     )
 if __name__ == "__main__":
     # Launch the app only when this file is run as a script; debug=True is
     # passed straight through to demo.launch().
+    demo.launch(debug=True)