Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Community

DevNumb committed on Dec 5, 2025

Commit

d94ebbf

verified ·

1 Parent(s): 9de60e0

Update app.py

Browse files

Files changed (1) hide show

app.py +194 -66

app.py CHANGED Viewed

@@ -7,13 +7,16 @@ import warnings
 import scipy.io.wavfile
 warnings.filterwarnings("ignore")
-# Inline CSS for Gradio 3.x
 css = """
 <style>
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 }
 .header {
@@ -108,11 +111,12 @@ css = """
     left: 100%;
 }
 textarea {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border: 2px solid rgba(255, 255, 255, 0.1) !important;
     border-radius: 15px !important;
-    color: white !important;
     padding: 1rem !important;
     font-size: 1.1em !important;
     transition: all 0.3s ease !important;
@@ -120,8 +124,14 @@ textarea {
 textarea:focus {
     border-color: #667eea !important;
-    box-shadow: 0 0 20px rgba(102, 126, 234, 0.3) !important;
-    background: rgba(255, 255, 255, 0.08) !important;
 }
 .stats-card {
@@ -162,6 +172,7 @@ textarea:focus {
     border-radius: 10px !important;
     margin: 0 0.25rem !important;
     transition: all 0.3s ease !important;
 }
 .tab-nav button.selected {
@@ -207,38 +218,71 @@ input[type="range"]::-webkit-slider-thumb {
     transform: translateY(-2px) !important;
 }
-#component-0 {
-    min-height: 100vh;
-    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
-    padding: 2rem;
 }
 </style>
 """
 # Global variable for model
 _tts_model = None
 def load_model():
-    """Load the TTS model once"""
-    global _tts_model
     if _tts_model is None:
         print("🚀 Loading VibeVoice model...")
         try:
-            # Try using pipeline first
-            from transformers import pipeline
-            _tts_model = pipeline(
-                "text-to-speech",
-                model="microsoft/VibeVoice-Realtime-0.5B",
-                device=0 if torch.cuda.is_available() else -1
             )
-            print("✅ Model loaded successfully using pipeline!")
         except Exception as e:
-            print(f"⚠️ Pipeline loading failed: {e}")
             print("⚠️ Falling back to simple tone generation")
             _tts_model = "simple"
-    return _tts_model
 # Stats tracking
 class TTSStats:
@@ -280,26 +324,50 @@ def generate_simple_tone(text, sampling_rate=16000):
     return audio, sampling_rate
 def generate_speech(text, speed=1.0, emotion="neutral"):
-    """Generate speech from text"""
     try:
         if not text or text.strip() == "":
             return None, "Please enter some text to convert to speech."
-        if len(text) > 1000:
-            text = text[:1000]
         stats.add_generation(text)
-        model = load_model()
         if model == "simple":
             audio, sampling_rate = generate_simple_tone(text)
-            message = f"⚠️ Using simple tone generation (model not available)"
         else:
-            print(f"Generating speech for: {text[:50]}...")
-            result = model(text)
-            audio = result["audio"]
-            sampling_rate = result["sampling_rate"]
             emotion_icons = {
                 "neutral": "😐",
                 "happy": "😊",
@@ -308,7 +376,9 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
                 "professional": "💼"
             }
             icon = emotion_icons.get(emotion, "🎵")
-            message = f"{icon} Generated {len(text)} characters with {emotion} tone"
         # Normalize audio
         max_val = np.max(np.abs(audio))
@@ -329,16 +399,18 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
             <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
                 <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
                 <div style='color: rgba(255,255,255,0.8);'>
-                    Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
-                    Speed: <strong>{speed}x</strong>
                 </div>
             </div>
             """
             return tmp_file.name, success_message
     except Exception as e:
-        print(f"Error generating speech: {e}")
         try:
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
@@ -370,7 +442,7 @@ def update_stats_display():
     </div>
     """
-# Create the interface with proper Gradio 3.x syntax
 with gr.Blocks() as demo:
     # Add CSS as HTML
     gr.HTML(css)
@@ -379,17 +451,17 @@ with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML("""
         <div class="header">
-            <h1>🎵 VibeVoice TTS</h1>
-            <p>Transform Text into Natural Speech</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🤖 AI Powered
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    ⚡ Real-time
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🎭 Emotional Voices
                 </span>
             </div>
         </div>
@@ -404,7 +476,7 @@ with gr.Blocks() as demo:
             text_input = gr.Textbox(
                 label="",
-                placeholder="Enter your text here... (Max 1000 characters)",
                 lines=6
             )
@@ -413,7 +485,8 @@ with gr.Blocks() as demo:
             emotion = gr.Dropdown(
                 label="Voice Emotion",
                 choices=["neutral", "happy", "excited", "calm", "professional"],
-                value="neutral"
             )
             speed = gr.Slider(
@@ -426,7 +499,7 @@ with gr.Blocks() as demo:
             # Action Buttons
             with gr.Row():
-                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                 clear_btn = gr.Button("Clear", variant="secondary")
             # Quick Actions
@@ -461,61 +534,105 @@ with gr.Blocks() as demo:
         with gr.TabItem("💡 Examples"):
             gr.Examples(
                 examples=[
-                    ["Hello, welcome to VibeVoice text-to-speech!"],
-                    ["This is a demonstration of AI speech synthesis."],
-                    ["The weather is beautiful today."],
-                    ["Artificial intelligence is amazing technology."],
-                    ["Please enjoy this text to speech demonstration."]
                 ],
                 inputs=text_input,
-                label="Click any example to try it"
             )
-        with gr.TabItem("ℹ️ About"):
             gr.Markdown("""
-            ## About VibeVoice TTS
-            This application converts text into speech using AI technology.
             ### Features:
-            - **AI-Powered**: Uses advanced machine learning models
-            - **Multiple Emotions**: Choose different voice tones
-            - **Adjustable Speed**: Control speaking rate
-            - **Real-time**: Fast generation
-            ### Tips:
-            - Keep text under 500 characters for best results
-            - Try different emotions for varied expressions
-            - Adjust speed to match your preference
-            ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
             """)
     gr.HTML('</div>')
     # Footer
     gr.HTML("""
-    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
         <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
-            Made with ❤️ using Gradio & Transformers
         </p>
     </div>
     """)
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
         if not text or text.strip() == "":
             return None, "⚠️ Please enter some text first!", update_stats_display()
         audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
         stats_html = update_stats_display()
         return audio_path, status_msg, stats_html
     def clear_all():
-        return "", None, "Cleared. Ready for new input.", update_stats_display()
     def test_voice():
-        return "Hello! This is a test of the VibeVoice text-to-speech system."
     # Connect buttons
     generate_btn.click(
@@ -547,11 +664,22 @@ with gr.Blocks() as demo:
         inputs=[],
         outputs=[stats_display]
     )
 # Launch the app
 if __name__ == "__main__":
     demo.launch(
         debug=True,
         share=False,
-        server_name="0.0.0.0"
     )

 import scipy.io.wavfile
 warnings.filterwarnings("ignore")
+# Inline CSS with black text input
 css = """
 <style>
 .gradio-container {
     max-width: 1200px !important;
     margin: 0 auto !important;
     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
+    min-height: 100vh;
+    padding: 2rem;
 }
 .header {
     left: 100%;
 }
+/* BLACK TEXT INPUT */
 textarea {
+    background: rgba(255, 255, 255, 0.95) !important;
+    border: 2px solid rgba(102, 126, 234, 0.3) !important;
     border-radius: 15px !important;
+    color: #1e293b !important; /* Dark text color */
     padding: 1rem !important;
     font-size: 1.1em !important;
     transition: all 0.3s ease !important;
 textarea:focus {
     border-color: #667eea !important;
+    box-shadow: 0 0 20px rgba(102, 126, 234, 0.5) !important;
+    background: white !important;
+    color: #1e293b !important;
+}
+textarea::placeholder {
+    color: #666 !important;
+    opacity: 0.8 !important;
 }
 .stats-card {
     border-radius: 10px !important;
     margin: 0 0.25rem !important;
     transition: all 0.3s ease !important;
+    color: rgba(255, 255, 255, 0.7) !important;
 }
 .tab-nav button.selected {
     transform: translateY(-2px) !important;
 }
+.dropdown {
+    background: rgba(255, 255, 255, 0.1) !important;
+    color: white !important;
+    border-radius: 10px !important;
+}
+.dropdown option {
+    background: #1e293b !important;
+    color: white !important;
+}
+.markdown {
+    color: rgba(255, 255, 255, 0.9) !important;
+}
+.markdown h1, .markdown h2, .markdown h3 {
+    color: white !important;
 }
 </style>
 """
 # Global variable for model
 _tts_model = None
+_tts_processor = None
 def load_model():
+    """Load the VibeVoice model directly"""
+    global _tts_model, _tts_processor
     if _tts_model is None:
         print("🚀 Loading VibeVoice model...")
         try:
+            # Try direct import first
+            from transformers import VibeVoiceStreamingForConditionalGenerationInference, AutoProcessor
+            _tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                "microsoft/VibeVoice-Realtime-0.5B",
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto"
             )
+            _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+            print("✅ VibeVoice model loaded successfully!")
+        except ImportError as e:
+            print(f"❌ Import error: {e}")
+            print("⚠️ Trying alternative import...")
+            try:
+                # Alternative import
+                from transformers import AutoModelForTextToSpeech, AutoProcessor
+                _tts_model = AutoModelForTextToSpeech.from_pretrained(
+                    "microsoft/VibeVoice-Realtime-0.5B",
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                    device_map="auto"
+                )
+                _tts_processor = AutoProcessor.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
+                print("✅ Model loaded with AutoModelForTextToSpeech!")
+            except Exception as e2:
+                print(f"❌ All imports failed: {e2}")
+                print("⚠️ Falling back to simple tone generation")
+                _tts_model = "simple"
         except Exception as e:
+            print(f"❌ Model loading error: {e}")
             print("⚠️ Falling back to simple tone generation")
             _tts_model = "simple"
+    return _tts_model, _tts_processor
 # Stats tracking
 class TTSStats:
     return audio, sampling_rate
 def generate_speech(text, speed=1.0, emotion="neutral"):
+    """Generate speech from text using VibeVoice model"""
     try:
         if not text or text.strip() == "":
             return None, "Please enter some text to convert to speech."
+        if len(text) > 500:
+            text = text[:500]
+            message_note = f"⚠️ Text truncated to 500 characters"
+        else:
+            message_note = ""
         stats.add_generation(text)
+        model, processor = load_model()
         if model == "simple":
             audio, sampling_rate = generate_simple_tone(text)
+            message = f"⚠️ Using simple tone generation (VibeVoice model not available)"
         else:
+            print(f"🔊 Generating speech for: {text[:50]}...")
+            # Prepare inputs
+            inputs = processor(
+                text=text,
+                return_tensors="pt",
+                sampling_rate=16000,
+            )
+            # Move to device
+            device = next(model.parameters()).device
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            # Generate audio
+            with torch.no_grad():
+                audio_tensor = model.generate(
+                    **inputs,
+                    temperature=0.7,
+                    do_sample=True,
+                )
+            # Convert to numpy
+            audio = audio_tensor.cpu().numpy().squeeze()
+            sampling_rate = 16000
+            # Format success message
             emotion_icons = {
                 "neutral": "😐",
                 "happy": "😊",
                 "professional": "💼"
             }
             icon = emotion_icons.get(emotion, "🎵")
+            message = f"{icon} VibeVoice generated {len(text)} characters"
+            if message_note:
+                message += f"<br>{message_note}"
         # Normalize audio
         max_val = np.max(np.abs(audio))
             <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
                 <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
                 <div style='color: rgba(255,255,255,0.8);'>
+                    Audio length: <strong>{len(audio)/sampling_rate:.2f}s</strong> |
+                    Speed: <strong>{speed}x</strong> |
+                    Emotion: <strong>{emotion}</strong>
                 </div>
             </div>
             """
             return tmp_file.name, success_message
     except Exception as e:
+        print(f"❌ Error generating speech: {e}")
         try:
+            # Create a fallback audio file
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
     </div>
     """
+# Create the interface
 with gr.Blocks() as demo:
     # Add CSS as HTML
     gr.HTML(css)
     with gr.Column():
         gr.HTML("""
         <div class="header">
+            <h1>🎵 VibeVoice TTS Pro</h1>
+            <p>Transform Text into Natural Speech with Microsoft VibeVoice</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🎵 Microsoft VibeVoice
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    ⚡ Real-time Generation
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🎭 Emotional Control
                 </span>
             </div>
         </div>
             text_input = gr.Textbox(
                 label="",
+                placeholder="Type your text here... (Max 500 characters for best results)",
                 lines=6
             )
             emotion = gr.Dropdown(
                 label="Voice Emotion",
                 choices=["neutral", "happy", "excited", "calm", "professional"],
+                value="neutral",
+                elem_id="emotion-select"
             )
             speed = gr.Slider(
             # Action Buttons
             with gr.Row():
+                generate_btn = gr.Button("✨ Generate Speech", variant="primary", elem_id="generate-btn")
                 clear_btn = gr.Button("Clear", variant="secondary")
             # Quick Actions
         with gr.TabItem("💡 Examples"):
             gr.Examples(
                 examples=[
+                    ["Hello! Welcome to VibeVoice text-to-speech demonstration."],
+                    ["The quick brown fox jumps over the lazy dog."],
+                    ["Artificial intelligence is transforming our world in amazing ways."],
+                    ["This is a test of the text to speech generation system."],
+                    ["Would you like a cup of coffee or tea this morning?"]
                 ],
                 inputs=text_input,
+                label="Click any example to load it"
             )
+        with gr.TabItem("ℹ️ About & Settings"):
             gr.Markdown("""
+            ## 🎵 VibeVoice TTS Pro
+            Powered by **Microsoft VibeVoice-Realtime-0.5B**, a state-of-the-art text-to-speech model.
             ### Features:
+            - **High-Quality Speech**: Professional-grade voice synthesis
+            - **Real-time Processing**: Fast generation with GPU acceleration
+            - **Emotional Control**: Multiple voice emotions to choose from
+            - **Speed Adjustment**: Control speaking rate from 0.5x to 2.0x
+            ### Tips for Best Results:
+            1. Keep text under **500 characters** for optimal performance
+            2. Try different emotions for varied expressions
+            3. Adjust speed to match your preference
+            4. Use clear, well-punctuated text
+            ### Model Information:
+            - **Model**: VibeVoice-Realtime-0.5B
+            - **Parameters**: 0.5 Billion
+            - **Audio Quality**: 16kHz sampling rate
+            - **Language**: English (optimized)
+            ⚠️ **Note**: First generation may take longer as the model loads.
             """)
     gr.HTML('</div>')
     # Footer
     gr.HTML("""
+    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
+        <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
+            <span style="color: rgba(255,255,255,0.7);">🤖 Microsoft VibeVoice Model</span>
+            <span style="color: rgba(255,255,255,0.7);">⚡ Real-time Processing</span>
+            <span style="color: rgba(255,255,255,0.7);">✨ Beautiful Interface</span>
+        </div>
         <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
+            Made with ❤️ using Transformers & Gradio |
+            <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
         </p>
     </div>
+    <script>
+        function updateTime() {
+            const now = new Date();
+            const timeString = now.toLocaleTimeString();
+            document.getElementById('live-time').textContent = timeString;
+        }
+        setInterval(updateTime, 1000);
+        updateTime();
+        // Add keyboard shortcut
+        document.addEventListener('keydown', function(e) {
+            if (e.ctrlKey && e.key === 'Enter') {
+                document.getElementById('generate-btn').click();
+            }
+        });
+    </script>
     """)
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
+        """Handle speech generation"""
         if not text or text.strip() == "":
             return None, "⚠️ Please enter some text first!", update_stats_display()
+        # Show processing message
+        processing_msg = """
+        <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea; margin: 1rem 0;'>
+            <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>⏳ Generating speech...</div>
+            <div style='color: rgba(255,255,255,0.8);'>Please wait while the model processes your text.</div>
+        </div>
+        """
         audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
         stats_html = update_stats_display()
         return audio_path, status_msg, stats_html
     def clear_all():
+        """Clear all inputs"""
+        return "", None, """
+        <div style='text-align: center; color: rgba(255,255,255,0.7);'>
+            Cleared. Ready for new input.
+        </div>
+        """, update_stats_display()
     def test_voice():
+        """Load test text"""
+        return "Hello! This is a demonstration of the VibeVoice text-to-speech system. The voice sounds natural and clear."
     # Connect buttons
     generate_btn.click(
         inputs=[],
         outputs=[stats_display]
     )
+    # Initialize
+    demo.load(
+        fn=update_stats_display,
+        inputs=[],
+        outputs=[stats_display]
+    )
 # Launch the app
 if __name__ == "__main__":
+    # Load model at startup
+    load_model()
     demo.launch(
         debug=True,
         share=False,
+        server_name="0.0.0.0",
+        server_port=7860
     )