Spaces:

DevNumb
/

TextTOVoiceConv

Sleeping

App Files Files Community

DevNumb committed on Dec 5, 2025

Commit

afd6946

verified ·

1 Parent(s): 8247c4d

Update app.py

Browse files

Files changed (1) hide show

app.py +359 -56

app.py CHANGED Viewed

@@ -1,99 +1,402 @@
 import gradio as gr
 import tempfile
-import os
 import warnings
 warnings.filterwarnings("ignore")
-# CSS for white background with black text
-css = """
 <style>
-body {
     background: white !important;
     padding: 20px;
 }
 textarea {
     background: white !important;
-    color: black !important;
-    border: 2px solid #4CAF50 !important;
-    border-radius: 10px !important;
-    padding: 15px !important;
     font-size: 16px !important;
     width: 100% !important;
 }
 button {
-    background: #4CAF50 !important;
-    color: white !important;
     border: none !important;
-    padding: 10px 20px !important;
-    border-radius: 5px !important;
 }
 </style>
 """
-def text_to_speech_actual(text):
-    """Use actual TTS engine"""
-    if not text:
-        return None
     try:
-        # Try using gTTS (Google Text-to-Speech) - works well and is free
-        from gtts import gTTS
-        import pygame
-        # Create temporary file
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-            temp_file = f.name
-        # Generate speech
-        tts = gTTS(text=text, lang='en', slow=False)
-        tts.save(temp_file)
-        return temp_file
     except Exception as e:
-        print(f"TTS Error: {e}")
-        return None
-# Simple interface
-with gr.Blocks(css=css) as demo:
-    gr.Markdown("# 🎵 Actual Text-to-Speech")
-    gr.Markdown("This uses real TTS to convert text to speech")
-    text_input = gr.Textbox(
-        label="Enter Text",
-        placeholder="Type your text here...",
-        lines=4
-    )
     with gr.Row():
-        generate_btn = gr.Button("Generate Speech")
-        clear_btn = gr.Button("Clear")
-    audio_output = gr.Audio(type="filepath", label="Speech Output")
-    status = gr.Markdown("Ready...")
     gr.Examples(
         examples=[
-            ["Hello! This is actual text-to-speech conversion."],
-            ["Welcome to the speech synthesis system."],
-            ["The quick brown fox jumps over the lazy dog."]
         ],
-        inputs=text_input
     )
-    def process(text):
-        audio = text_to_speech_actual(text)
-        if audio:
-            return audio, "✅ Speech generated successfully!"
-        return None, "❌ Failed to generate speech"
-    def clear():
-        return "", None, "Cleared"
-    generate_btn.click(process, text_input, [audio_output, status])
-    clear_btn.click(clear, [], [text_input, audio_output, status])
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import torch
+import numpy as np
 import tempfile
+import time
 import warnings
 warnings.filterwarnings("ignore")
+# HTML document (header banner plus a <style> sheet) rendered through gr.HTML: white page, dark grey body text, pure black text in textareas
+html_with_css = """
+<!DOCTYPE html>
+<html>
+<head>
 <style>
+body, .gradio-container {
     background: white !important;
+    color: #333333 !important;
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+    margin: 0;
     padding: 20px;
 }
+.header {
+    text-align: center;
+    padding: 2rem;
+    background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%);
+    border-radius: 16px;
+    margin-bottom: 2rem;
+    color: white;
+}
+.header h1 {
+    font-size: 2.5em;
+    margin: 0 0 0.5rem 0;
+    font-weight: 700;
+}
+/* BLACK TEXT ON WHITE - MOST IMPORTANT */
 textarea {
     background: white !important;
+    border: 2px solid #4F46E5 !important;
+    border-radius: 12px !important;
+    color: #000000 !important; /* Pure black text */
+    padding: 1rem !important;
     font-size: 16px !important;
     width: 100% !important;
+    min-height: 120px !important;
+    font-family: monospace !important;
+}
+textarea::placeholder {
+    color: #666666 !important;
 }
 button {
+    padding: 0.75rem 1.5rem !important;
+    border-radius: 10px !important;
+    font-weight: 600 !important;
+    margin: 0.5rem !important;
+    cursor: pointer !important;
+}
+.primary-btn {
+    background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%) !important;
     border: none !important;
+    color: white !important;
+}
+.secondary-btn {
+    background: white !important;
+    border: 2px solid #D1D5DB !important;
+    color: #374151 !important;
+}
+.card {
+    background: white;
+    border: 1px solid #E5E7EB;
+    border-radius: 12px;
+    padding: 1.5rem;
+    margin-bottom: 1rem;
+}
+.status-success {
+    background: #DCFCE7;
+    border: 1px solid #86EFAC;
+    border-left: 4px solid #10B981;
+    color: #065F46;
+    padding: 1rem;
+    border-radius: 8px;
+    margin: 1rem 0;
+}
+.status-info {
+    background: #DBEAFE;
+    border: 1px solid #93C5FD;
+    border-left: 4px solid #3B82F6;
+    color: #1E40AF;
+    padding: 1rem;
+    border-radius: 8px;
+    margin: 1rem 0;
 }
 </style>
+</head>
+<body>
+<div class="header">
+    <h1>🎵 Text-to-Speech</h1>
+    <p>Convert text to speech with smaller AI model</p>
+</div>
+</body>
+</html>
 """
+print("🚀 Starting TTS System...")
+# Try to load a SMALLER TTS model that fits in free tier
+def load_small_tts_model():
+    """Pick the first TTS back end that loads and return it tagged by kind.
+
+    Back ends are tried in order: Coqui XTTS v2, Microsoft SpeechT5, then
+    Suno Bark (small). Every attempt is isolated, so a failure in one back
+    end (missing package, failed download, out of memory, ...) moves on to
+    the next one instead of ending the search.
+
+    Returns:
+        A ``(model_type, model)`` tuple where ``model_type`` is one of
+        ``"coqui"``, ``"speecht5"``, ``"bark"`` or ``"gtts"``:
+
+        * ``"coqui"``    -> the ``TTS`` instance
+        * ``"speecht5"`` -> dict with ``processor``, ``model``, ``vocoder``
+        * ``"bark"``     -> dict with ``processor``, ``model``
+        * ``"gtts"``     -> ``None`` (nothing loaded locally)
+    """
+    print("📥 Loading smaller TTS model...")
+
+    # Option 1: Coqui TTS.
+    # NOTE(review): XTTS v2 is presumably a large checkpoint - confirm it
+    # really fits the free tier before keeping it first in line.
+    try:
+        from TTS.api import TTS
+        coqui = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
+        print("✅ Loaded Coqui XTTS model")
+        return ("coqui", coqui)
+    except ImportError:
+        print("  Coqui TTS not available")
+    except Exception as e:
+        # Any other load failure must not skip the remaining back ends.
+        print(f"  Coqui TTS failed: {e}")
+
+    # Option 2: SpeechT5, kept on CPU to save memory.
+    try:
+        from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+        model = model.to("cpu")
+        vocoder = vocoder.to("cpu")
+        print("✅ Loaded SpeechT5 model (CPU)")
+        return ("speecht5", {"processor": processor, "model": model, "vocoder": vocoder})
+    except Exception as e:
+        print(f"  SpeechT5 failed: {e}")
+
+    # Option 3: Bark (small checkpoint), also on CPU.
+    try:
+        from transformers import AutoProcessor, BarkModel
+        processor = AutoProcessor.from_pretrained("suno/bark-small")
+        model = BarkModel.from_pretrained("suno/bark-small")
+        model = model.to("cpu")
+        print("✅ Loaded Bark model (CPU)")
+        return ("bark", {"processor": processor, "model": model})
+    except Exception as e:
+        print(f"  Bark failed: {e}")
+
+    # Nothing could be loaded locally: signal the gTTS fallback.
+    print("⚠️ No small TTS model loaded, using gTTS fallback")
+    return ("gtts", None)
+# Load the back end once, at import time. `tts_model` is None when
+# `model_type` is "gtts" (no local model could be loaded).
+model_type, tts_model = load_small_tts_model()
+def generate_with_model(text, speed=1.0):
+    """Synthesize *text* with the model loaded at startup and save it as WAV.
+
+    Dispatches on the module-level ``model_type`` / ``tts_model`` pair.
+
+    Args:
+        text: Text to speak. Empty or whitespace-only text yields no audio.
+        speed: Accepted for interface compatibility; no back end in this
+            function applies it.
+
+    Returns:
+        ``(wav_path, sample_rate)`` on success, or ``(None, None)`` when the
+        text is blank, no local model is loaded, or synthesis fails. Errors
+        are printed rather than raised so the caller can fall back.
+    """
+    import os
+
+    out_path = None
+    try:
+        if not text or not text.strip():
+            return None, None
+        print(f"🔊 Generating: {text[:50]}...")
+        if not tts_model or model_type not in ("coqui", "speecht5", "bark"):
+            return None, None
+
+        # Reserve the output path and close the handle right away: the
+        # writers below reopen the file by name.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            out_path = f.name
+
+        if model_type == "coqui":
+            # NOTE(review): XTTS-style models usually also need a speaker
+            # reference and a language - confirm this call succeeds.
+            tts_model.tts_to_file(text=text, file_path=out_path)
+            return out_path, 24000
+
+        import scipy.io.wavfile
+        if model_type == "speecht5":
+            processor = tts_model["processor"]
+            model = tts_model["model"]
+            vocoder = tts_model["vocoder"]
+            inputs = processor(text=text, return_tensors="pt")
+            with torch.no_grad():
+                # NOTE(review): no speaker embeddings are passed - verify
+                # generate_speech accepts that in the installed version.
+                speech = model.generate_speech(inputs["input_ids"], vocoder=vocoder)
+            audio = speech.numpy()
+            sample_rate = 16000
+        else:
+            # Bark
+            processor = tts_model["processor"]
+            model = tts_model["model"]
+            inputs = processor(text, return_tensors="pt")
+            with torch.no_grad():
+                audio = model.generate(**inputs)
+                audio = audio.cpu().numpy().squeeze()
+            sample_rate = 24000
+
+        scipy.io.wavfile.write(out_path, sample_rate, audio.astype(np.float32))
+        return out_path, sample_rate
     except Exception as e:
+        print(f"❌ Model generation error: {e}")
+        # Do not leave an empty or half-written temp file behind.
+        if out_path is not None:
+            try:
+                os.remove(out_path)
+            except OSError:
+                pass  # best-effort cleanup only
+        return None, None
+def generate_with_gtts(text):
+    """Synthesize *text* with gTTS (needs internet) and save it as MP3.
+
+    Args:
+        text: Text to speak, in English.
+
+    Returns:
+        ``(mp3_path, "gTTS")`` on success, or ``(None, None)`` when gTTS is
+        not installed or the request fails. Errors are printed, not raised.
+    """
+    import os
+
+    out_path = None
+    try:
+        from gtts import gTTS
+        # Reserve the path and close the handle before gTTS writes to it.
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+            out_path = f.name
+        tts = gTTS(text=text, lang='en', slow=False)
+        tts.save(out_path)
+        return out_path, "gTTS"
+    except Exception as e:
+        print(f"❌ gTTS error: {e}")
+        # Do not leave an empty temp file behind when the request fails.
+        if out_path is not None:
+            try:
+                os.remove(out_path)
+            except OSError:
+                pass  # best-effort cleanup only
+        return None, None
+def create_basic_audio(text):
+    """Render a short synthetic tone derived from *text* as a last resort.
+
+    The clip lasts 0.05 s per character, capped at 5 s, at 24 kHz. Each of
+    the first 20 characters adds one sine partial at 220 Hz plus
+    ``ord(char) % 300`` with amplitude ``0.3 / position``; the sum is then
+    shaped by a rise-and-decay envelope.
+
+    Returns:
+        ``(wav_path, "Basic")`` - the float32 WAV written to a temporary
+        file, and the source label.
+    """
+    import scipy.io.wavfile
+
+    sample_rate = 24000
+    seconds = min(len(text) * 0.05, 5)
+    timeline = np.linspace(0, seconds, int(sample_rate * seconds))
+
+    # One partial per character, each quieter than the one before it.
+    signal = np.zeros_like(timeline)
+    for position, char in enumerate(text[:20], start=1):
+        pitch = 220 + (ord(char) % 300)
+        signal += (0.3 / position) * np.sin(2 * np.pi * pitch * timeline)
+
+    # Quick rise followed by a slower exponential decay.
+    signal *= np.exp(-2 * timeline) * (1 - np.exp(-8 * timeline))
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
+        scipy.io.wavfile.write(wav_file.name, sample_rate, signal.astype(np.float32))
+        return wav_file.name, "Basic"
+# Build the Gradio UI: styling banner, input/output columns, model note,
+# examples, and the two button handlers.
+with gr.Blocks() as demo:
+    # The stylesheet and header banner are injected as a raw HTML component
+    gr.HTML(html_with_css)
+    # Two-column layout: text entry on the left, audio result on the right
     with gr.Row():
+        # Left: text, speed slider, action buttons
+        with gr.Column(scale=2):
+            gr.Markdown("### 📝 Enter Text")
+            text_input = gr.Textbox(
+                label="",
+                placeholder="Type your text here... (Black text on white background)",
+                lines=5
+            )
+            with gr.Row():
+                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
+            with gr.Row():
+                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
+                clear_btn = gr.Button("Clear", variant="secondary")
+        # Right: player and status box
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎧 Audio Output")
+            audio_output = gr.Audio(type="filepath", label="")
+            status = gr.HTML("""
+            <div class="status-info">
+                <strong>Ready</strong><br>
+                Enter text and click Generate Speech
+            </div>
+            """)
+    # Tell the user which back end was selected at startup
+    gr.Markdown("### ℹ️ System Information")
+    model_notes = {
+        "coqui": "✅ **Model**: Coqui XTTS (Multilingual)",
+        "speecht5": "✅ **Model**: Microsoft SpeechT5",
+        "bark": "✅ **Model**: Suno Bark",
+        "gtts": "⚠️ **Model**: gTTS (Fallback - requires internet)",
+    }
+    gr.Markdown(model_notes.get(model_type, "⚠️ **Model**: Basic audio generation"))
+    # Clickable sample sentences that fill the text box
+    gr.Markdown("### 💡 Examples")
     gr.Examples(
         examples=[
+            ["Hello! Welcome to the text-to-speech system."],
+            ["This is a demonstration of AI speech synthesis."],
+            ["The quick brown fox jumps over the lazy dog."],
+            ["Artificial intelligence is transforming technology."]
         ],
+        inputs=text_input,
+        label="Click to try:"
     )
+    # Handler for "Generate Speech": returns (audio path or None, status HTML)
+    def process_text(text, speed_val):
+        if not (text and text.strip()):
+            return None, """
+            <div class="status-info">
+                <strong>⚠️ Please enter text</strong><br>
+                Type something in the text box above
+            </div>
+            """
+        print(f"Processing: {text[:50]}...")
+        # Preferred path: the locally loaded model
+        wav_path, _rate = generate_with_model(text, speed_val)
+        origin = "AI Model"
+        # Second choice: gTTS
+        if wav_path is None:
+            wav_path, origin = generate_with_gtts(text)
+        # Third choice: synthetic tone
+        if wav_path is None:
+            wav_path, origin = create_basic_audio(text)
+        if not wav_path:
+            return None, """
+            <div class="status-info">
+                <strong>❌ Failed to generate</strong><br>
+                Please try different text
+            </div>
+            """
+        return wav_path, f"""
+            <div class="status-success">
+                <strong>✅ Speech Generated!</strong><br>
+                Source: {origin} • Characters: {len(text)}<br>
+                Speed: {speed_val}x
+            </div>
+            """
+    # Handler for "Clear": empties the text box and player, resets the status
+    def clear_all():
+        return "", None, """
+        <div class="status-info">
+            <strong>Cleared</strong><br>
+            Ready for new text input
+        </div>
+        """
+    # Wire the buttons to their handlers
+    generate_btn.click(process_text, [text_input, speed], [audio_output, status])
+    clear_btn.click(clear_all, [], [text_input, audio_output, status])
+# Start the Gradio server only when this file is run as a script
 if __name__ == "__main__":
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "show_error": True,
+        "quiet": True,
+    }
+    demo.launch(**launch_options)