Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Community

DevNumb committed on Dec 5, 2025

Commit

12fa800

verified ·

1 Parent(s): c94ed7a

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -192

app.py CHANGED Viewed

@@ -6,13 +6,6 @@ import time
 import warnings
 warnings.filterwarnings("ignore")
-# Try to import the pipeline
-try:
-    from transformers import pipeline
-    HAS_TRANSFORMERS = True
-except ImportError:
-    HAS_TRANSFORMERS = False
 # Custom CSS for beautiful UI
 custom_css = """
 .gradio-container {
@@ -181,39 +174,6 @@ custom_css = """
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
 }
-.progress-container {
-    margin: 1rem 0;
-}
-.progress-bar {
-    height: 6px;
-    background: rgba(255, 255, 255, 0.1);
-    border-radius: 10px;
-    overflow: hidden;
-    position: relative;
-}
-.progress-fill {
-    height: 100%;
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    width: 0%;
-    border-radius: 10px;
-    transition: width 0.3s ease;
-    position: relative;
-}
-.progress-fill::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    bottom: 0;
-    background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent);
-    animation: shimmer 2s infinite;
-}
-/* Custom slider */
 .custom-slider .gr-slider {
     background: rgba(255, 255, 255, 0.1) !important;
     height: 8px !important;
@@ -246,42 +206,31 @@ custom_css = """
 }
 """
-# Initialize model
-@gr.cache_resource
 def load_model():
-    print("🚀 Loading VibeVoice model...")
-    try:
-        if HAS_TRANSFORMERS:
-            # Use the pipeline API which is more stable
-            pipe = pipeline(
                 "text-to-speech",
                 model="microsoft/VibeVoice-Realtime-0.5B",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 device=0 if torch.cuda.is_available() else -1
             )
             print("✅ Model loaded successfully using pipeline!")
-            return pipe
-        else:
-            print("❌ Transformers not available")
-            return None
-    except Exception as e:
-        print(f"❌ Error loading model: {e}")
-        # Try alternative import
-        try:
-            from transformers import VitsModel, AutoTokenizer
-            print("⚠️ Trying alternative model loading...")
-            model = VitsModel.from_pretrained(
-                "microsoft/VibeVoice-Realtime-0.5B",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-            )
-            tokenizer = AutoTokenizer.from_pretrained("microsoft/VibeVoice-Realtime-0.5B")
-            return {"model": model, "tokenizer": tokenizer}
-        except Exception as e2:
-            print(f"❌ Alternative loading also failed: {e2}")
-            return None
-# Initialize model
-model_pipe = load_model()
 # Stats tracking
 class TTSStats:
@@ -307,43 +256,62 @@ class TTSStats:
 stats = TTSStats()
 def generate_speech(text, speed=1.0, emotion="neutral"):
-    """Generate speech from text using the pipeline"""
     try:
         if not text or text.strip() == "":
             return None, "Please enter some text to convert to speech."
         if len(text) > 1000:
             text = text[:1000]
-            gr.Warning("Text truncated to 1000 characters for better performance.")
         # Update stats
         stats.add_generation(text)
-        if model_pipe is None:
-            return None, "Model not loaded. Please check the logs."
-        # Generate speech
-        print(f"Generating speech for: {text[:50]}...")
-        if isinstance(model_pipe, dict):
-            # Alternative model loading
-            from scipy.io.wavfile import write
-            import io
-            inputs = model_pipe["tokenizer"](text, return_tensors="pt")
-            with torch.no_grad():
-                output = model_pipe["model"](**inputs)
-            audio = output.waveform.squeeze().cpu().numpy()
-            sampling_rate = model_pipe["model"].config.sampling_rate
         else:
-            # Pipeline API
-            result = model_pipe(text)
             audio = result["audio"]
             sampling_rate = result["sampling_rate"]
         # Normalize audio
         audio = audio / np.max(np.abs(audio)) * 0.95
@@ -359,29 +327,29 @@ def generate_speech(text, speed=1.0, emotion="neutral"):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
-            message = f"""
             <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
-                <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ Generation Complete!</div>
                 <div style='color: rgba(255,255,255,0.8);'>
-                    Generated <strong>{len(text)}</strong> characters<br>
-                    Emotion: <strong>{emotion}</strong> | Speed: <strong>{speed}x</strong><br>
-                    Duration: <strong>{len(audio)/sampling_rate:.1f}s</strong>
                 </div>
             </div>
             """
-            return tmp_file.name, message
     except Exception as e:
         print(f"Error generating speech: {e}")
-        # Create a simple fallback audio
         try:
             import scipy.io.wavfile
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
-                return tmp_file.name, f"❌ Error: {str(e)}. Generated silent audio as fallback."
         except:
-            return None, f"❌ Error: {str(e)}"
 def update_stats_display():
     """Update the statistics display"""
@@ -409,12 +377,7 @@ def update_stats_display():
 # Create the interface
 with gr.Blocks(
-    title="🎵 VibeVoice Pro - AI Text to Speech",
-    theme=gr.themes.Soft(
-        primary_hue="violet",
-        secondary_hue="purple",
-        neutral_hue="slate"
-    ),
     css=custom_css
 ) as demo:
@@ -422,17 +385,17 @@ with gr.Blocks(
     with gr.Column(elem_classes="header"):
         gr.HTML("""
         <div style="text-align: center;">
-            <h1>🎵 VibeVoice Pro</h1>
-            <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Natural, Expressive Speech</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🤖 Powered by Microsoft VibeVoice
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    ⚡ Real-time Generation
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
-                    🎭 Multiple Emotions
                 </span>
             </div>
         </div>
@@ -442,13 +405,12 @@ with gr.Blocks(
     with gr.Row():
         # Left Panel - Input Controls
         with gr.Column(scale=1, elem_classes="glass-card"):
-            gr.Markdown("### 📝 Text Input")
             text_input = gr.Textbox(
                 label="",
-                placeholder="✨ Enter your text here... (Maximum 1000 characters)",
                 lines=6,
-                max_lines=10,
                 elem_classes="fancy-textbox"
             )
@@ -458,8 +420,7 @@ with gr.Blocks(
                 emotion = gr.Dropdown(
                     label="Voice Emotion",
                     choices=["neutral", "happy", "excited", "calm", "professional"],
-                    value="neutral",
-                    info="Select the emotional tone"
                 )
             with gr.Row():
@@ -468,8 +429,7 @@ with gr.Blocks(
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
-                    label="🎚️ Speaking Speed",
-                    info="Adjust the speaking rate",
                     elem_classes="custom-slider"
                 )
@@ -478,12 +438,10 @@ with gr.Blocks(
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
-                    size="lg",
-                    elem_classes="glow-button",
-                    scale=2
                 )
                 clear_btn = gr.Button(
-                    "🗑️ Clear All",
                     variant="secondary",
                     elem_classes="secondary-button"
                 )
@@ -491,8 +449,8 @@ with gr.Blocks(
             # Quick Actions
             gr.Markdown("### ⚡ Quick Actions")
             with gr.Row():
-                quick_test = gr.Button("🎯 Test Voice", size="sm", elem_classes="secondary-button")
-                quick_clear = gr.Button("📄 Clear Text", size="sm", elem_classes="secondary-button")
         # Right Panel - Output Display
         with gr.Column(scale=1, elem_classes="glass-card"):
@@ -501,19 +459,13 @@ with gr.Blocks(
             with gr.Column(elem_classes="audio-player"):
                 audio_output = gr.Audio(
                     label="",
-                    type="filepath",
-                    elem_id="audio_output"
                 )
                 # Status and Info
                 status_display = gr.HTML(
                     value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
                 )
-            # Download and Share
-            with gr.Row():
-                download_btn = gr.Button("💾 Download Audio", elem_classes="secondary-button")
-                copy_btn = gr.Button("📋 Copy Text", elem_classes="secondary-button")
     # Bottom Section - Stats and Examples
     with gr.Column(elem_classes="glass-card"):
@@ -522,93 +474,68 @@ with gr.Blocks(
                 stats_display = gr.HTML(
                     value=update_stats_display()
                 )
-                refresh_stats = gr.Button("🔄 Refresh Stats", size="sm", elem_classes="secondary-button")
             with gr.TabItem("💡 Examples"):
                 gr.Examples(
                     examples=[
-                        ["Welcome to the future of text-to-speech technology! This is VibeVoice Pro."],
-                        ["In a world where AI transforms everything, voice synthesis stands at the forefront."],
-                        ["The quick brown fox jumps over the lazy dog. This tests all English phonemes."],
-                        ["Imagine a world where every written word can be heard in beautiful, human-like voice."],
-                        ["This is not just text-to-speech. This is emotion and expression in every syllable."]
                     ],
                     inputs=text_input,
-                    label="Click any example to try it",
-                    examples_per_page=5
                 )
-            with gr.TabItem("⚙️ Settings & Info"):
-                gr.Markdown("### About VibeVoice Pro")
                 gr.Markdown("""
-                **VibeVoice Pro** uses Microsoft's state-of-the-art VibeVoice model for high-quality speech synthesis.
                 ### Features:
-                - 🎵 **High Quality**: Professional-grade speech synthesis
-                - ⚡ **Real-time**: Fast generation with GPU acceleration
-                - 🎭 **Emotional Control**: Multiple voice emotions
-                - 🎚️ **Customizable**: Adjustable speed and parameters
-                ### Technical Info:
-                - **Model**: VibeVoice-Realtime-0.5B
-                - **Max Input**: 1000 characters
-                - **Audio Quality**: 16kHz, 32-bit float
-                - **Languages**: English (optimized)
-                ⚠️ **Note**: For best results, keep text under 500 characters.
                 """)
     # Footer
     gr.HTML("""
-    <div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
-        <div style="display: flex; justify-content: center; gap: 2rem; margin-bottom: 1rem; flex-wrap: wrap;">
-            <span style="color: rgba(255,255,255,0.7);">📖 Powered by Transformers</span>
-            <span style="color: rgba(255,255,255,0.7);">🎵 Microsoft VibeVoice</span>
-            <span style="color: rgba(255,255,255,0.7);">✨ Gradio Interface</span>
-        </div>
         <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
-            Made with ❤️ |
-            <span id="live-time" style="color: #667eea; font-weight: 600;"></span>
         </p>
     </div>
-    <script>
-        function updateTime() {
-            const now = new Date();
-            const timeString = now.toLocaleTimeString();
-            document.getElementById('live-time').textContent = timeString;
-        }
-        setInterval(updateTime, 1000);
-        updateTime();
-    </script>
     """)
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
         """Handle speech generation"""
         if not text or text.strip() == "":
-            return None, "<div style='color: #ff6b6b; text-align: center;'>⚠️ Please enter some text first!</div>", update_stats_display()
-        # Show processing message
-        yield None, "<div style='color: #667eea; text-align: center;'>⏳ Generating speech... Please wait.</div>", update_stats_display()
-        # Generate speech
         audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
-        # Update stats
         stats_html = update_stats_display()
         return audio_path, status_msg, stats_html
     def clear_all():
-        return "", None, "<div style='color: rgba(255,255,255,0.7); text-align: center;'>Cleared. Ready for new input.</div>", update_stats_display()
     def test_voice():
-        test_text = "This is a test of the VibeVoice Pro text-to-speech system. How amazing is this technology?"
         return test_text
-    def copy_text():
-        return gr.Info("Text copied to clipboard!")
     # Connect buttons
     generate_btn.click(
         fn=process_generation,
@@ -640,23 +567,17 @@ with gr.Blocks(
         outputs=[stats_display]
     )
-    copy_btn.click(
-        fn=copy_text,
-        inputs=[],
-        outputs=[]
-    )
-    # Initialize
     demo.load(
-        fn=lambda: (update_stats_display(), gr.Info("VibeVoice Pro is ready! Enter text and click Generate Speech.")),
         inputs=[],
         outputs=[stats_display]
     )
 if __name__ == "__main__":
     demo.launch(
         debug=True,
         share=False,
-        server_name="0.0.0.0",
-        server_port=7860
     )

 import warnings
 warnings.filterwarnings("ignore")
 # Custom CSS for beautiful UI
 custom_css = """
 .gradio-container {
     border: 2px solid rgba(255, 255, 255, 0.1) !important;
 }
 .custom-slider .gr-slider {
     background: rgba(255, 255, 255, 0.1) !important;
     height: 8px !important;
 }
 """
+# Global variable for model (simple caching)
+_tts_model = None
+_tts_processor = None
 def load_model():
+    """Load the TTS model once"""
+    global _tts_model, _tts_processor
+    if _tts_model is None:
+        print("🚀 Loading VibeVoice model...")
+        try:
+            # Try using pipeline first
+            from transformers import pipeline
+            _tts_model = pipeline(
                 "text-to-speech",
                 model="microsoft/VibeVoice-Realtime-0.5B",
                 device=0 if torch.cuda.is_available() else -1
             )
             print("✅ Model loaded successfully using pipeline!")
+        except Exception as e:
+            print(f"⚠️ Pipeline loading failed: {e}")
+            print("⚠️ Falling back to simple tone generation")
+            _tts_model = "simple"
+    return _tts_model
 # Stats tracking
 class TTSStats:
 stats = TTSStats()
+def generate_simple_tone(text, sampling_rate=16000):
+    """Generate a simple tone for fallback"""
+    # Create tone based on text
+    duration = min(len(text) * 0.05, 5)  # Up to 5 seconds
+    t = np.linspace(0, duration, int(sampling_rate * duration))
+    # Generate tone with varying frequency based on text
+    base_freq = 220 + (hash(text) % 200)  # Vary frequency
+    audio = 0.5 * np.sin(2 * np.pi * base_freq * t)
+    # Add harmonics
+    audio += 0.2 * np.sin(2 * np.pi * base_freq * 2 * t)
+    audio += 0.1 * np.sin(2 * np.pi * base_freq * 3 * t)
+    # Envelope to make it sound more natural
+    envelope = np.exp(-2 * t) * (1 - np.exp(-10 * t))
+    audio *= envelope
+    return audio, sampling_rate
 def generate_speech(text, speed=1.0, emotion="neutral"):
+    """Generate speech from text"""
     try:
         if not text or text.strip() == "":
             return None, "Please enter some text to convert to speech."
         if len(text) > 1000:
             text = text[:1000]
         # Update stats
         stats.add_generation(text)
+        # Load model
+        model = load_model()
+        if model == "simple":
+            # Use simple tone generation
+            audio, sampling_rate = generate_simple_tone(text)
+            message = f"⚠️ Using simple tone generation (model not available)<br>Text: {text[:50]}..."
         else:
+            # Use transformer pipeline
+            print(f"Generating speech for: {text[:50]}...")
+            result = model(text)
             audio = result["audio"]
             sampling_rate = result["sampling_rate"]
+            # Format message based on emotion
+            emotion_icons = {
+                "neutral": "😐",
+                "happy": "😊",
+                "excited": "🎉",
+                "calm": "😌",
+                "professional": "💼"
+            }
+            icon = emotion_icons.get(emotion, "🎵")
+            message = f"{icon} Generated {len(text)} characters with {emotion} tone"
         # Normalize audio
         audio = audio / np.max(np.abs(audio)) * 0.95
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             scipy.io.wavfile.write(tmp_file.name, sampling_rate, audio.astype(np.float32))
+            success_message = f"""
             <div style='background: rgba(102, 126, 234, 0.1); padding: 1rem; border-radius: 10px; border-left: 4px solid #667eea;'>
+                <div style='color: #667eea; font-weight: 600; margin-bottom: 0.5rem;'>✅ {message}</div>
                 <div style='color: rgba(255,255,255,0.8);'>
+                    Length: <strong>{len(audio)/sampling_rate:.1f}s</strong> |
+                    Speed: <strong>{speed}x</strong> |
+                    Emotion: <strong>{emotion}</strong>
                 </div>
             </div>
             """
+            return tmp_file.name, success_message
     except Exception as e:
         print(f"Error generating speech: {e}")
+        # Create silent audio as fallback
         try:
             import scipy.io.wavfile
             silent_audio = np.zeros(16000, dtype=np.float32)
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                 scipy.io.wavfile.write(tmp_file.name, 16000, silent_audio)
+                return tmp_file.name, f"❌ Error: {str(e)[:100]}"
         except:
+            return None, f"❌ Error: {str(e)[:100]}"
 def update_stats_display():
     """Update the statistics display"""
 # Create the interface
 with gr.Blocks(
+    title="🎵 VibeVoice TTS",
     css=custom_css
 ) as demo:
     with gr.Column(elem_classes="header"):
         gr.HTML("""
         <div style="text-align: center;">
+            <h1>🎵 VibeVoice TTS</h1>
+            <p style="font-size: 1.2em; opacity: 0.9;">Transform Text into Speech</p>
             <div style="display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap;">
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🤖 AI Powered
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    ⚡ Real-time
                 </span>
                 <span style="background: rgba(255,255,255,0.2); padding: 0.5rem 1rem; border-radius: 20px; font-size: 0.9em;">
+                    🎭 Emotional Voices
                 </span>
             </div>
         </div>
     with gr.Row():
         # Left Panel - Input Controls
         with gr.Column(scale=1, elem_classes="glass-card"):
+            gr.Markdown("### 📝 Input Text")
             text_input = gr.Textbox(
                 label="",
+                placeholder="Enter your text here... (Max 1000 characters)",
                 lines=6,
                 elem_classes="fancy-textbox"
             )
                 emotion = gr.Dropdown(
                     label="Voice Emotion",
                     choices=["neutral", "happy", "excited", "calm", "professional"],
+                    value="neutral"
                 )
             with gr.Row():
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
+                    label="Speaking Speed",
                     elem_classes="custom-slider"
                 )
                 generate_btn = gr.Button(
                     "✨ Generate Speech",
                     variant="primary",
+                    elem_classes="glow-button"
                 )
                 clear_btn = gr.Button(
+                    "Clear",
                     variant="secondary",
                     elem_classes="secondary-button"
                 )
             # Quick Actions
             gr.Markdown("### ⚡ Quick Actions")
             with gr.Row():
+                quick_test = gr.Button("Test Voice", elem_classes="secondary-button")
+                quick_clear = gr.Button("Clear Text", elem_classes="secondary-button")
         # Right Panel - Output Display
         with gr.Column(scale=1, elem_classes="glass-card"):
             with gr.Column(elem_classes="audio-player"):
                 audio_output = gr.Audio(
                     label="",
+                    type="filepath"
                 )
                 # Status and Info
                 status_display = gr.HTML(
                     value="<div style='text-align: center; color: rgba(255,255,255,0.7);'>Ready to generate speech...</div>"
                 )
     # Bottom Section - Stats and Examples
     with gr.Column(elem_classes="glass-card"):
                 stats_display = gr.HTML(
                     value=update_stats_display()
                 )
+                refresh_stats = gr.Button("Refresh Stats", elem_classes="secondary-button")
             with gr.TabItem("💡 Examples"):
                 gr.Examples(
                     examples=[
+                        ["Hello, welcome to VibeVoice text-to-speech!"],
+                        ["This is a demonstration of AI speech synthesis."],
+                        ["The weather is beautiful today."],
+                        ["Artificial intelligence is amazing technology."],
+                        ["Please enjoy this text to speech demonstration."]
                     ],
                     inputs=text_input,
+                    label="Click any example to try it"
                 )
+            with gr.TabItem("ℹ️ About"):
                 gr.Markdown("""
+                ## About VibeVoice TTS
+                This application converts text into speech using AI technology.
                 ### Features:
+                - **AI-Powered**: Uses advanced machine learning models
+                - **Multiple Emotions**: Choose different voice tones
+                - **Adjustable Speed**: Control speaking rate
+                - **Real-time**: Fast generation
+                ### Tips:
+                - Keep text under 500 characters for best results
+                - Try different emotions for varied expressions
+                - Adjust speed to match your preference
+                ⚠️ **Note**: If the model fails to load, a simple tone generator will be used as fallback.
                 """)
     # Footer
     gr.HTML("""
+    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: rgba(255,255,255,0.05); border-radius: 15px;">
         <p style="color: rgba(255,255,255,0.5); font-size: 0.9em;">
+            Made with ❤️ using Gradio & Transformers
         </p>
     </div>
     """)
     # Event Handlers
     def process_generation(text, emotion_val, speed_val):
         """Handle speech generation"""
         if not text or text.strip() == "":
+            return None, "⚠️ Please enter some text first!", update_stats_display()
         audio_path, status_msg = generate_speech(text, speed_val, emotion_val)
         stats_html = update_stats_display()
         return audio_path, status_msg, stats_html
     def clear_all():
+        return "", None, "Cleared. Ready for new input.", update_stats_display()
     def test_voice():
+        test_text = "Hello! This is a test of the VibeVoice text-to-speech system."
         return test_text
     # Connect buttons
     generate_btn.click(
         fn=process_generation,
         outputs=[stats_display]
     )
+    # Initialize stats on load
     demo.load(
+        fn=update_stats_display,
         inputs=[],
         outputs=[stats_display]
     )
+# Launch the app
 if __name__ == "__main__":
     demo.launch(
         debug=True,
         share=False,
+        server_name="0.0.0.0"
     )