Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Community

DevNumb committed on Dec 5, 2025

Commit

136a3d3

verified ·

1 Parent(s): d94ebbf

Update app.py

Browse files

Files changed (1) hide show

app.py +298 -532

app.py CHANGED Viewed

@@ -5,18 +5,32 @@ import tempfile
 import time
 import warnings
 import scipy.io.wavfile
 warnings.filterwarnings("ignore")
-# Inline CSS with black text input
 css = """
 <style>
 .gradio-container {
-    max-width: 1200px !important;
     margin: 0 auto !important;
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
-    min-height: 100vh;
-    padding: 2rem;
 }
 .header {
@@ -30,23 +44,6 @@ css = """
     overflow: hidden;
 }
-.header::before {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%);
-    animation: shimmer 3s infinite linear;
-    background-size: 200% auto;
-}
-@keyframes shimmer {
-    0% { background-position: -200% center; }
-    100% { background-position: 200% center; }
-}
 .header h1 {
     font-size: 3em;
     margin-bottom: 0.5rem;
@@ -54,632 +51,401 @@ css = """
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-weight: 800;
-    position: relative;
-    z-index: 1;
-}
-.header p {
-    font-size: 1.2em;
-    opacity: 0.9;
-    position: relative;
-    z-index: 1;
-}
-.glass-card {
-    background: rgba(255, 255, 255, 0.1) !important;
-    backdrop-filter: blur(10px) !important;
-    border: 1px solid rgba(255, 255, 255, 0.2) !important;
-    border-radius: 20px !important;
-    padding: 1.5rem !important;
-    transition: all 0.3s ease !important;
-}
-.glass-card:hover {
-    transform: translateY(-5px) !important;
-    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.3) !important;
-}
-.glow-button {
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-    border: none !important;
-    color: white !important;
-    padding: 0.8rem 1.5rem !important;
-    border-radius: 50px !important;
-    font-weight: 600 !important;
-    transition: all 0.3s ease !important;
-    position: relative !important;
-    overflow: hidden !important;
 }
-.glow-button:hover {
-    transform: scale(1.05) !important;
-    box-shadow: 0 10px 30px rgba(102, 126, 234, 0.6) !important;
 }
-.glow-button::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: -100%;
-    width: 100%;
-    height: 100%;
-    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
-    transition: 0.5s;
 }
-.glow-button:hover::after {
-    left: 100%;
 }
-/* BLACK TEXT INPUT */
 textarea {
-    background: rgba(255, 255, 255, 0.95) !important;
-    border: 2px solid rgba(102, 126, 234, 0.3) !important;
     border-radius: 15px !important;
-    color: #1e293b !important; /* Dark text color */
     padding: 1rem !important;
-    font-size: 1.1em !important;
-    transition: all 0.3s ease !important;
-}
-textarea:focus {
-    border-color: #667eea !important;
-    box-shadow: 0 0 20px rgba(102, 126, 234, 0.5) !important;
-    background: white !important;
-    color: #1e293b !important;
 }
 textarea::placeholder {
     color: #666 !important;
-    opacity: 0.8 !important;
 }
-.stats-card {
-    background: rgba(255, 255, 255, 0.08) !important;
-    padding: 1rem !important;
-    border-radius: 15px !important;
-    text-align: center !important;
-    transition: transform 0.3s ease !important;
-}
-.stats-card:hover {
-    transform: scale(1.05) !important;
 }
-.stats-value {
-    font-size: 2em !important;
-    font-weight: 700 !important;
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-    -webkit-background-clip: text !important;
-    -webkit-text-fill-color: transparent !important;
-    margin-bottom: 0.5rem !important;
 }
-.stats-label {
-    color: rgba(255, 255, 255, 0.7) !important;
-    font-size: 0.8em !important;
-    text-transform: uppercase !important;
-    letter-spacing: 1px !important;
 }
-.tab-nav {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border-radius: 15px !important;
-    padding: 0.5rem !important;
 }
-.tab-nav button {
-    border-radius: 10px !important;
-    margin: 0 0.25rem !important;
-    transition: all 0.3s ease !important;
-    color: rgba(255, 255, 255, 0.7) !important;
 }
-.tab-nav button.selected {
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-    color: white !important;
 }
 .audio-player {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border-radius: 15px !important;
-    padding: 1.5rem !important;
-    border: 2px solid rgba(255, 255, 255, 0.1) !important;
-}
-input[type="range"] {
-    background: rgba(255, 255, 255, 0.1) !important;
-    height: 8px !important;
-    border-radius: 10px !important;
-}
-input[type="range"]::-webkit-slider-thumb {
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-    border: none !important;
-    width: 24px !important;
-    height: 24px !important;
-    border-radius: 50% !important;
-    box-shadow: 0 4px 10px rgba(0,0,0,0.3) !important;
-    cursor: pointer !important;
 }
-.secondary-button {
-    background: rgba(255, 255, 255, 0.1) !important;
-    border: 2px solid rgba(255, 255, 255, 0.3) !important;
-    color: white !important;
-    padding: 0.6rem 1.2rem !important;
-    border-radius: 50px !important;
-    transition: all 0.3s ease !important;
 }
-.secondary-button:hover {
-    background: rgba(255, 255, 255, 0.2) !important;
-    border-color: rgba(255, 255, 255, 0.5) !important;
-    transform: translateY(-2px) !important;
 }
-.dropdown {
-    background: rgba(255, 255, 255, 0.1) !important;
-    color: white !important;
-    border-radius: 10px !important;
-}
-.dropdown option {
-    background: #1e293b !important;
-    color: white !important;
-}
-.markdown {
-    color: rgba(255, 255, 255, 0.9) !important;
-}
-.markdown h1, .markdown h2, .markdown h3 {
-    color: white !important;
 }
 </style>
 """
-# Global variable for model
-_tts_model = None
-_tts_processor = None
-def load_model():
-    """Load the VibeVoice model directly"""
-    global _tts_model, _tts_processor
-    if _tts_model is None:
-        print("🚀 Loading VibeVoice model...")
-        try:
-            # Try direct import first
-            from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
-            _tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-                "microsoft/VibeVoice-Realtime-0.5B",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto"
-            )
-            _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
-            print("✅ VibeVoice model loaded successfully!")
-        except ImportError as e:
-            print(f"❌ Import error: {e}")
-            print("⚠️ Trying alternative import...")
-            try:
-                # Alternative import
-                from transformers import AutoModelForTextToSpeech, AutoProcessor
-                _tts_model = AutoModelForTextToSpeech.from_pretrained(
-                    "microsoft/VibeVoice-Realtime-0.5B",
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                    device_map="auto"
-                )
-                _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
-                print("✅ Model loaded with AutoModelForTextToSpeech!")
-            except Exception as e2:
-                print(f"❌ All imports failed: {e2}")
-                print("⚠️ Falling back to simple tone generation")
-                _tts_model = "simple"
-        except Exception as e:
-            print(f"❌ Model loading error: {e}")
-            print("⚠️ Falling back to simple tone generation")
-            _tts_model = "simple"
-    return _tts_model, _tts_processor
-# Stats tracking
-class TTSStats:
     def __init__(self):
-        self.total_generations = 0
-        self.total_chars = 0
-        self.start_time = time.time()
-    def add_generation(self, text):
-        self.total_generations += 1
-        self.total_chars += len(text)
-    def get_stats(self):
-        uptime = time.time() - self.start_time
-        hours, remainder = divmod(uptime, 3600)
-        minutes, seconds = divmod(remainder, 60)
         return {
-            'total_generations': self.total_generations,
-            'total_chars': self.total_chars,
-            'avg_chars': self.total_chars / max(self.total_generations, 1),
-            'uptime': f"{int(hours)}h {int(minutes)}m"
         }
-stats = TTSStats()
-def generate_simple_tone(text, sampling_rate=16000):
-    """Generate a simple tone for fallback"""
-    duration = min(len(text) * 0.05, 5)
-    t = np.linspace(0, duration, int(sampling_rate * duration))
-    base_freq = 220 + (hash(text) % 200)
-    audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
-    audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
-    audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
     envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
     audio *= envelope
-    return audio, sampling_rate
-def generate_speech(text, speed=1.0, emotion="neutral"):
-    """Generate speech from text using VibeVoice model"""
     try:
-        if not text or text.strip() == "":
-            return None, "Please enter some text to convert to speech."
-        if len(text) > 500:
-            text = text[:500]
-            message_note = f"⚠️ Text truncated to 500 characters"
-        else:
-            message_note = ""
-        stats.add_generation(text)
-        model, processor = load_model()
-        if model == "simple":
-            audio, sampling_rate = generate_simple_tone(text)
-            message = f"⚠️ Using simple tone generation (VibeVoice model not available)"
-        else:
-            print(f"🔊 Generating speech for: {text[:50]}...")
-            # Prepare inputs
-            inputs = processor(
                 text=text,
                 return_tensors="pt",
                 sampling_rate=16000,
             )
-            # Move to device
-            device = next(model.parameters()).device
             inputs = {k: v.to(device) for k, v in inputs.items()}
-            # Generate audio
             with torch.no_grad():
-                audio_tensor = model.generate(
                     **inputs,
                     temperature=0.7,
                     do_sample=True,
                 )
-            # Convert to numpy
             audio = audio_tensor.cpu().numpy().squeeze()
-            sampling_rate = 16000
-            # Format success message
-            emotion_icons = {
-                "neutral": "😐",
-                "happy": "😊",
-                "excited": "🎉",
-                "calm": "😌",
-                "professional": "💼"
-            }
-            icon = emotion_icons.get(emotion, "🎵")
-            message = f"{icon} VibeVoice generated {len(text)} characters"
-            if message_note:
-                message += f"<br>{message_note}"
-        # Normalize audio
         max_val = np.max(np.abs(audio))
         if max_val > 0:
-            audio = audio / max_val * 0.95
-        # Apply speed adjustment
         if speed != 1.0:
             from scipy import signal
-            new_length = int(len(audio) / speed)
-            audio = signal.resample(audio, new_length)
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
-            success_message = f"""
-            <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
-                <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
-                <div style='color: rgba(255,255,255,0.8);'>
-                    Audio length: <strong>{len(audio)/sampling_rate:.2f}s</strong> |
-                    Speed: <strong>{speed}x</strong> |
-                    Emotion: <strong>{emotion}</strong>
                 </div>
             </div>
             """
-            return tmp_file.name, success_message
     except Exception as e:
-        print(f"❌ Error generating speech: {e}")
-        try:
-            # Create a fallback audio file
-            silent_audio = np.zeros(16000, dtype=np.float32)
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
-                return tmp_file.name, f"❌ Error: {str(e)[:100]}"
-        except:
-            return None, f"❌ Error: {str(e)[:100]}"
-def update_stats_display():
-    """Update the statistics display"""
-    stats_data = stats.get_stats()
     return f"""
-    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 1rem;">
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['total_generations']}</div>
-            <div class="stats-label">Total Generations</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['total_chars']}</div>
-            <div class="stats-label">Characters Processed</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['avg_chars']:.0f}</div>
-            <div class="stats-label">Avg. Characters</div>
         </div>
-        <div class="stats-card">
-            <div class="stats-value">{stats_data['uptime']}</div>
-            <div class="stats-label">System Uptime</div>
         </div>
     </div>
     """
 # Create the interface
 with gr.Blocks() as demo:
-    # Add CSS as HTML
     gr.HTML(css)
-    # Header Section
-    with gr.Column():
-        gr.HTML("""
-        <div class="header">
-            <h1>🎵 VibeVoice TTS Pro</h1>
-            <p>Transform Text into Natural Speech with Microsoft VibeVoice</p>
-            <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🎵 Microsoft VibeVoice
-                </span>
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    ⚡ Real-time Generation
-                </span>
-                <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🎭 Emotional Control
-                </span>
-            </div>
-        </div>
-        """)
-    # Main Content
     with gr.Row():
-        # Left Panel - Input Controls
-        with gr.Column(scale=1):
-            gr.HTML('<div class="glass-card">')
-            gr.Markdown("### 📝 Input Text")
-            text_input = gr.Textbox(
-                label="",
-                placeholder="Type your text here... (Max 500 characters for best results)",
-                lines=6
-            )
-            gr.Markdown("### 🎭 Voice Settings")
-            emotion = gr.Dropdown(
-                label="Voice Emotion",
-                choices=["neutral", "happy", "excited", "calm", "professional"],
-                value="neutral",
-                elem_id="emotion-select"
-            )
-            speed = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Speaking Speed"
-            )
-            # Action Buttons
-            with gr.Row():
-                generate_btn = gr.Button("✨ Generate Speech", variant="primary", elem_id="generate-btn")
-                clear_btn = gr.Button("Clear", variant="secondary")
-            # Quick Actions
-            gr.Markdown("### ⚡ Quick Actions")
-            with gr.Row():
-                quick_test = gr.Button("Test Voice", variant="secondary")
-                quick_clear = gr.Button("Clear Text", variant="secondary")
-            gr.HTML('</div>')
-        # Right Panel - Output Display
         with gr.Column(scale=1):
-            gr.HTML('<div class="glass-card">')
-            gr.Markdown("### 🎧 Generated Audio")
-            gr.HTML('<div class="audio-player">')
-            audio_output = gr.Audio(label="", type="filepath")
-            status_display = gr.HTML(
-                value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
-            )
-            gr.HTML('</div>')
-            gr.HTML('</div>')
-    # Bottom Section - Tabs
-    gr.HTML('<div class="glass-card">')
-    with gr.Tabs():
-        with gr.TabItem("📈 Statistics"):
-            stats_display = gr.HTML(value=update_stats_display())
-            refresh_stats = gr.Button("Refresh Stats", variant="secondary")
-        with gr.TabItem("💡 Examples"):
-            gr.Examples(
-                examples=[
-                    ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
-                    ["The quick brown fox jumps over the lazy dog."],
-                    ["Artificial intelligence is transforming our world in amazing ways."],
-                    ["This is a test of the text to speech generation system."],
-                    ["Would you like a cup of coffee or tea this morning?"]
-                ],
-                inputs=text_input,
-                label="Click any example to load it"
-            )
-        with gr.TabItem("ℹ️ About & Settings"):
-            gr.Markdown("""
-            ## 🎵 VibeVoice TTS Pro
-            Powered by **Microsoft VibeVoice-Realtime-0.5B**, a state-of-the-art text-to-speech model.
-            ### Features:
-            - **High-Quality Speech**: Professional-grade voice synthesis
-            - **Real-time Processing**: Fast generation with GPU acceleration
-            - **Emotional Control**: Multiple voice emotions to choose from
-            - **Speed Adjustment**: Control speaking rate from 0.5x to 2.0x
-            ### Tips for Best Results:
-            1. Keep text under **500 characters** for optimal performance
-            2. Try different emotions for varied expressions
-            3. Adjust speed to match your preference
-            4. Use clear, well-punctuated text
-            ### Model Information:
-            - **Model**: VibeVoice-Realtime-0.5B
-            - **Parameters**: 0.5 Billion
-            - **Audio Quality**: 16kHz sampling rate
-            - **Language**: English (optimized)
-            ⚠️ **Note**: First generation may take longer as the model loads.
-            """)
-    gr.HTML('</div>')
-    # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
-        <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
-            <span style="color: rgba(255,255,255,0.7);">🤖 Microsoft VibeVoice Model</span>
-            <span style="color: rgba(255,255,255,0.7);">⚡ Real-time Processing</span>
-            <span style="color: rgba(255,255,255,0.7);">✨ Beautiful Interface</span>
-        </div>
-        <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
-            Made with ❤️ using Transformers & Gradio |
-            <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
-        </p>
-    </div>
-    <script>
-        function updateTime() {
-            const now = new Date();
-            const timeString = now.toLocaleTimeString();
-            document.getElementById('live-time').textContent = timeString;
-        }
-        setInterval(updateTime, 1000);
-        updateTime();
-        // Add keyboard shortcut
-        document.addEventListener('keydown', function(e) {
-            if (e.ctrlKey && e.key === 'Enter') {
-                document.getElementById('generate-btn').click();
-            }
-        });
-    </script>
-    """)
-    # Event Handlers
-    def process_generation(text, emotion_val, speed_val):
-        """Handle speech generation"""
-        if not text or text.strip() == "":
-            return None, "⚠️ Please enter some text first!", update_stats_display()
-        # Show processing message
-        processing_msg = """
-        <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
-            <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>⏳ Generating speech...</div>
-            <div style='color: rgba(255,255,255,0.8);'>Please wait while the model processes your text.</div>
-        </div>
-        """
-        audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
-        stats_html = update_stats_display()
-        return audio_path, status_msg, stats_html
-    def clear_all():
-        """Clear all inputs"""
-        return "", None, """
-        <div style='text-align: center; color: rgba(255,255,255,0.7);'>
-            Cleared. Ready for new input.
-        </div>
-        """, update_stats_display()
-    def test_voice():
-        """Load test text"""
-        return "Hello! This is a demonstration of the VibeVoice text-to-speech system. The voice sounds natural and clear."
     # Connect buttons
     generate_btn.click(
-        fn=process_generation,
-        inputs=[text_input, emotion, speed],
-        outputs=[audio_output, status_display, stats_display]
     )
     clear_btn.click(
-        fn=clear_all,
-        inputs=[],
-        outputs=[text_input, audio_output, status_display, stats_display]
-    )
-    quick_test.click(
-        fn=test_voice,
-        inputs=[],
-        outputs=[text_input]
-    )
-    quick_clear.click(
-        fn=lambda: "",
-        inputs=[],
-        outputs=[text_input]
-    )
-    refresh_stats.click(
-        fn=update_stats_display,
-        inputs=[],
-        outputs=[stats_display]
     )
-    # Initialize
-    demo.load(
-        fn=update_stats_display,
-        inputs=[],
-        outputs=[stats_display]
     )
-# Launch the app
 if __name__ == "__main__":
-    # Load model at startup
-    load_model()
-    demo.launch(
-        debug=True,
-        share=False,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 import time
 import warnings
 import scipy.io.wavfile
+import sys
+import os
 warnings.filterwarnings("ignore")
+# Suppress asyncio warnings
+if sys.version_info[0] == 3 and sys.version_info[1] >= 8:
+    import asyncio
+    try:
+        asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
+    except:
+        pass
+# CSS with black text input
 css = """
 <style>
+body {
+    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
+    margin: 0;
+    padding: 20px;
+    min-height: 100vh;
+}
 .gradio-container {
+    max-width: 1200px;
     margin: 0 auto !important;
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 }
 .header {
     overflow: hidden;
 }
 .header h1 {
     font-size: 3em;
     margin-bottom: 0.5rem;
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-weight: 800;
 }
+.card {
+    background: rgba(255, 255, 255, 0.1);
+    backdrop-filter: blur(10px);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    border-radius: 20px;
+    padding: 1.5rem;
+    margin-bottom: 1.5rem;
 }
+.primary-btn {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border: none;
+    color: white;
+    padding: 0.8rem 1.5rem;
+    border-radius: 50px;
+    font-weight: 600;
+    cursor: pointer;
+    transition: transform 0.3s;
 }
+.primary-btn:hover {
+    transform: scale(1.05);
 }
+/* BLACK TEXT INPUT - MOST IMPORTANT FIX */
 textarea {
+    background: white !important;
+    border: 2px solid #667eea !important;
     border-radius: 15px !important;
+    color: #000000 !important; /* Black text */
     padding: 1rem !important;
+    font-size: 16px !important;
+    width: 100% !important;
+    box-sizing: border-box !important;
 }
 textarea::placeholder {
     color: #666 !important;
 }
+textarea:focus {
+    outline: none !important;
+    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.3) !important;
 }
+.slider {
+    width: 100%;
 }
+.stats {
+    display: grid;
+    grid-template-columns: repeat(2, 1fr);
+    gap: 1rem;
+    margin-top: 1rem;
 }
+.stat-box {
+    background: rgba(255, 255, 255, 0.08);
+    padding: 1rem;
+    border-radius: 15px;
+    text-align: center;
 }
+.stat-value {
+    font-size: 2em;
+    font-weight: bold;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
 }
+.stat-label {
+    color: rgba(255, 255, 255, 0.7);
+    font-size: 0.8em;
+    text-transform: uppercase;
 }
 .audio-player {
+    background: rgba(255, 255, 255, 0.05);
+    border-radius: 15px;
+    padding: 1.5rem;
+    margin-top: 1rem;
 }
+.tabs {
+    background: rgba(255, 255, 255, 0.05);
+    border-radius: 15px;
+    padding: 0.5rem;
+    margin-top: 1rem;
 }
+.tab-btn {
+    background: transparent;
+    border: none;
+    color: rgba(255, 255, 255, 0.7);
+    padding: 0.5rem 1rem;
+    border-radius: 10px;
+    cursor: pointer;
 }
+.tab-btn.selected {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
 }
 </style>
 """
+# Load model function
+def load_vibevoice_model():
+    """Load the VibeVoice model"""
+    print("Loading VibeVoice model...")
+    try:
+        # Direct import as specified
+        from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
+        model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+            "microsoft/VibeVoice-Realtime-0.5B",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto"
+        )
+        processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+        print("✅ VibeVoice model loaded successfully!")
+        return model, processor
+    except Exception as e:
+        print(f"❌ Error loading VibeVoice model: {e}")
+        print("⚠️ Using fallback tone generator")
+        return None, None
+# Load model at startup
+MODEL, PROCESSOR = load_vibevoice_model()
+# Stats tracker
+class Stats:
     def __init__(self):
+        self.count = 0
+        self.chars = 0
+        self.start = time.time()
+    def add(self, text):
+        self.count += 1
+        self.chars += len(text)
+    def get(self):
+        uptime = time.time() - self.start
+        hours = int(uptime // 3600)
+        minutes = int((uptime % 3600) // 60)
         return {
+            'count': self.count,
+            'chars': self.chars,
+            'avg': self.chars // max(self.count, 1),
+            'uptime': f"{hours}h {minutes}m"
         }
+stats = Stats()
+def create_fallback_audio(text):
+    """Create simple audio when model fails"""
+    duration = min(len(text) * 0.05, 3)
+    sr = 16000
+    t = np.linspace(0, duration, int(sr * duration))
+    freq = 220 + (len(text) % 300)
+    audio = 0.5 * np.sin(2 * np.pi * freq * t)
     envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
     audio *= envelope
+    return audio, sr
+def generate_speech(text, speed=1.0):
+    """Main function to generate speech"""
+    if not text or not text.strip():
+        return None, "⚠️ Please enter text"
+    # Limit text length
+    if len(text) > 500:
+        text = text[:500]
+        note = " (truncated to 500 chars)"
+    else:
+        note = ""
+    stats.add(text)
     try:
+        if MODEL and PROCESSOR:
+            # Use VibeVoice model
+            inputs = PROCESSOR(
                 text=text,
                 return_tensors="pt",
                 sampling_rate=16000,
             )
+            device = next(MODEL.parameters()).device
             inputs = {k: v.to(device) for k, v in inputs.items()}
             with torch.no_grad():
+                audio_tensor = MODEL.generate(
                     **inputs,
                     temperature=0.7,
                     do_sample=True,
                 )
             audio = audio_tensor.cpu().numpy().squeeze()
+            sr = 16000
+            source = "🎵 VibeVoice"
+        else:
+            # Fallback
+            audio, sr = create_fallback_audio(text)
+            source = "⚠️ Fallback Tone"
+        # Normalize
         max_val = np.max(np.abs(audio))
         if max_val > 0:
+            audio = audio / max_val * 0.9
+        # Adjust speed
         if speed != 1.0:
             from scipy import signal
+            new_len = int(len(audio) / speed)
+            audio = signal.resample(audio, new_len)
+        # Save to file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            scipy.io.wavfile.write(f.name, sr, audio.astype(np.float32))
+            duration = len(audio) / sr
+            message = f"""
+            <div style='background: rgba(102,126,234,0.1); padding: 1rem; border-radius: 10px;'>
+                <div style='color: #667eea; font-weight: bold;'>✅ {source}</div>
+                <div style='color: white; margin-top: 0.5rem;'>
+                    Generated: {len(text)} chars{note}<br>
+                    Duration: {duration:.2f}s | Speed: {speed}x
                 </div>
             </div>
             """
+            return f.name, message
     except Exception as e:
+        print(f"Generation error: {e}")
+        return None, f"❌ Error: {str(e)[:100]}"
+def get_stats_html():
+    """Generate stats display"""
+    data = stats.get()
     return f"""
+    <div class="stats">
+        <div class="stat-box">
+            <div class="stat-value">{data['count']}</div>
+            <div class="stat-label">Generations</div>
         </div>
+        <div class="stat-box">
+            <div class="stat-value">{data['chars']}</div>
+            <div class="stat-label">Characters</div>
         </div>
+        <div class="stat-box">
+            <div class="stat-value">{data['avg']}</div>
+            <div class="stat-label">Avg Length</div>
         </div>
+        <div class="stat-box">
+            <div class="stat-value">{data['uptime']}</div>
+            <div class="stat-label">Uptime</div>
         </div>
     </div>
     """
 # Create the interface
 with gr.Blocks() as demo:
+    # Add CSS
     gr.HTML(css)
+    # Header
+    gr.HTML("""
+    <div class="header">
+        <h1>🎵 VibeVoice TTS</h1>
+        <p>Microsoft VibeVoice Text-to-Speech</p>
+    </div>
+    """)
+    # Main layout
     with gr.Row():
+        # Left column - Input
+        with gr.Column(scale=2):
+            with gr.Column(elem_id="input-card"):
+                gr.Markdown("### 📝 Enter Text")
+                text_input = gr.Textbox(
+                    label="",
+                    placeholder="Type your text here... (Max 500 characters)",
+                    lines=5
+                )
+                gr.Markdown("### ⚙️ Settings")
+                speed = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Speaking Speed"
+                )
+                with gr.Row():
+                    generate_btn = gr.Button("✨ Generate Speech", variant="primary")
+                    clear_btn = gr.Button("Clear", variant="secondary")
+        # Right column - Output
         with gr.Column(scale=1):
+            with gr.Column(elem_id="output-card"):
+                gr.Markdown("### 🎧 Output")
+                audio_output = gr.Audio(type="filepath", label="")
+                status = gr.HTML("Ready...")
+    # Stats
+    with gr.Column(elem_id="stats-card"):
+        gr.Markdown("### 📊 Statistics")
+        stats_display = gr.HTML(get_stats_html())
+        refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
+    # Examples
+    with gr.Column(elem_id="examples-card"):
+        gr.Markdown("### 💡 Examples")
+        gr.Examples(
+            examples=[
+                ["Hello! Welcome to VibeVoice TTS."],
+                ["The quick brown fox jumps over the lazy dog."],
+                ["This is a test of the text to speech system."],
+                ["Artificial intelligence is amazing technology."]
+            ],
+            inputs=text_input,
+            label="Click to try"
+        )
+    # About
+    with gr.Column(elem_id="about-card"):
+        gr.Markdown("### ℹ️ About")
+        gr.Markdown("""
+        **VibeVoice TTS** uses Microsoft's VibeVoice-Realtime-0.5B model.
+        **Features:**
+        - High-quality speech synthesis
+        - Real-time processing
+        - Adjustable speaking speed
+        **Tips:**
+        - Keep text under 500 characters
+        - Use clear, well-punctuated text
+        - First generation may take longer
+        """)
+    # Event handlers
+    def process_text(text, speed_val):
+        if not text:
+            return None, "Enter text first", get_stats_html()
+        audio, msg = generate_speech(text, speed_val)
+        return audio, msg, get_stats_html()
+    def clear():
+        return "", None, "Cleared", get_stats_html()
     # Connect buttons
     generate_btn.click(
+        process_text,
+        [text_input, speed],
+        [audio_output, status, stats_display]
     )
     clear_btn.click(
+        clear,
+        [],
+        [text_input, audio_output, status, stats_display]
     )
+    refresh_btn.click(
+        get_stats_html,
+        [],
+        [stats_display]
     )
+# Run the app
 if __name__ == "__main__":
+    # Clean shutdown handling
+    try:
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            quiet=True  # Reduce console noise
+        )
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+    except Exception as e:
+        print(f"Error: {e}")