# Hugging Face Space: DevNumb/TextTOVoiceConv (status: Sleeping)
# File size: 12,017 Bytes

import gradio as gr
import torch
import numpy as np
import tempfile
import time
import warnings
warnings.filterwarnings("ignore")

# Static HTML fragment handed to gr.HTML when the interface is built.
# It carries two things:
#   * a <style> sheet: white page background with dark grey (#333333) body
#     text, pure black text inside <textarea>, and the button / card /
#     status-success / status-info classes (the status-* classes are the ones
#     used by the status messages emitted from the UI handlers);
#   * the gradient header banner shown at the top of the page.
html_with_css = """
<!DOCTYPE html>
<html>
<head>
<style>
body, .gradio-container {
    background: white !important;
    color: #333333 !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    margin: 0;
    padding: 20px;
}

.header {
    text-align: center;
    padding: 2rem;
    background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%);
    border-radius: 16px;
    margin-bottom: 2rem;
    color: white;
}

.header h1 {
    font-size: 2.5em;
    margin: 0 0 0.5rem 0;
    font-weight: 700;
}

/* BLACK TEXT ON WHITE - MOST IMPORTANT */
textarea {
    background: white !important;
    border: 2px solid #4F46E5 !important;
    border-radius: 12px !important;
    color: #000000 !important; /* Pure black text */
    padding: 1rem !important;
    font-size: 16px !important;
    width: 100% !important;
    min-height: 120px !important;
    font-family: monospace !important;
}

textarea::placeholder {
    color: #666666 !important;
}

button {
    padding: 0.75rem 1.5rem !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    margin: 0.5rem !important;
    cursor: pointer !important;
}

.primary-btn {
    background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%) !important;
    border: none !important;
    color: white !important;
}

.secondary-btn {
    background: white !important;
    border: 2px solid #D1D5DB !important;
    color: #374151 !important;
}

.card {
    background: white;
    border: 1px solid #E5E7EB;
    border-radius: 12px;
    padding: 1.5rem;
    margin-bottom: 1rem;
}

.status-success {
    background: #DCFCE7;
    border: 1px solid #86EFAC;
    border-left: 4px solid #10B981;
    color: #065F46;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}

.status-info {
    background: #DBEAFE;
    border: 1px solid #93C5FD;
    border-left: 4px solid #3B82F6;
    color: #1E40AF;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
</style>
</head>
<body>
<div class="header">
    <h1>🎵 Text-to-Speech</h1>
    <p>Convert text to speech with smaller AI model</p>
</div>
</body>
</html>
"""

# Startup banner: printed when the module is executed, before any model loads.
print("🚀 Starting TTS System...")

# Try to load a SMALLER TTS model that fits in free tier
def load_small_tts_model():
    """Load the first text-to-speech backend that initialises successfully.

    Backends are attempted in this order: Coqui XTTS v2, Microsoft SpeechT5,
    Suno Bark (small).  A backend that fails for *any* reason (package not
    installed, download error, out of memory, ...) is reported on stdout and
    skipped, so the remaining backends still get their chance.

    Returns:
        A ``(model_type, payload)`` tuple, one of:

        * ``("coqui", tts)`` - ``tts`` is the ``TTS.api.TTS`` instance.
        * ``("speecht5", {"processor": ..., "model": ..., "vocoder": ...})``
        * ``("bark", {"processor": ..., "model": ...})``
        * ``("gtts", None)`` - no local model could be loaded; callers are
          expected to fall back to gTTS.

    This function does not raise for backend failures.
    """
    print("📥 Loading smaller TTS model...")

    # Option 1: Coqui TTS.
    # NOTE(review): XTTS v2 is presumably a large checkpoint despite the
    # "smaller" wording used here - confirm it fits the deployment tier.
    try:
        from TTS.api import TTS

        coqui = TTS(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2",
            progress_bar=False,
        )
        print("✅ Loaded Coqui XTTS model")
        return ("coqui", coqui)
    except ImportError:
        print("  Coqui TTS not available")
    except Exception as e:
        # A non-import failure (e.g. the download failing) must not abort the
        # whole chain; keep going so SpeechT5 and Bark are still tried.
        print(f"  Coqui TTS failed: {e}")

    # Option 2: SpeechT5, kept on the CPU to save memory.
    try:
        from transformers import (
            SpeechT5ForTextToSpeech,
            SpeechT5HifiGan,
            SpeechT5Processor,
        )

        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cpu")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu")

        print("✅ Loaded SpeechT5 model (CPU)")
        return ("speecht5", {"processor": processor, "model": model, "vocoder": vocoder})
    except Exception as e:
        print(f"  SpeechT5 failed: {e}")

    # Option 3: Bark (small checkpoint), also on the CPU.
    try:
        from transformers import AutoProcessor, BarkModel

        processor = AutoProcessor.from_pretrained("suno/bark-small")
        model = BarkModel.from_pretrained("suno/bark-small").to("cpu")

        print("✅ Loaded Bark model (CPU)")
        return ("bark", {"processor": processor, "model": model})
    except Exception as e:
        print(f"  Bark failed: {e}")

    print("⚠️ No small TTS model loaded, using gTTS fallback")
    return ("gtts", None)

# Load the TTS backend once, when the module is executed.
# `model_type` is the backend tag returned by load_small_tts_model()
# ("coqui", "speecht5", "bark" or "gtts"); `tts_model` is the matching payload
# (the Coqui model object, a dict of model components, or None for "gtts").
model_type, tts_model = load_small_tts_model()

def _reserve_temp_file(suffix):
    """Create an empty temporary file, close it, and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _discard_file(path):
    """Best-effort removal of *path*; ``None`` or an already-missing file is ignored."""
    import os

    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup is opportunistic; failing to delete must not mask the
        # original synthesis error.
        pass


def generate_with_model(text, speed=1.0):
    """Synthesise *text* with the backend selected at start-up.

    Reads the module-level ``model_type`` / ``tts_model`` pair and writes the
    result to a temporary ``.wav`` file that the caller owns.

    Args:
        text: Text to speak.  Empty or whitespace-only text yields no audio.
        speed: Accepted for interface compatibility; it is currently not
            applied by any backend in this function.

    Returns:
        ``(path, sample_rate)`` on success - 24000 Hz for Coqui and Bark,
        16000 Hz for SpeechT5.  ``(None, None)`` when the text is empty, when
        no local model is loaded, or when synthesis fails (the error is
        printed, and any temporary file created for the attempt is removed so
        failed calls do not leak files).
    """
    out_path = None
    try:
        if not text or not text.strip():
            return None, None

        print(f"🔊 Generating: {text[:50]}...")

        if model_type == "coqui" and tts_model:
            # The handle is closed before Coqui opens the path itself, so the
            # file is never held open twice.
            # NOTE(review): XTTS v2 presumably needs speaker/language
            # arguments for tts_to_file - confirm against the Coqui API.
            out_path = _reserve_temp_file(".wav")
            tts_model.tts_to_file(text=text, file_path=out_path)
            return out_path, 24000

        if model_type == "speecht5" and tts_model:
            processor = tts_model["processor"]
            model = tts_model["model"]
            vocoder = tts_model["vocoder"]

            inputs = processor(text=text, return_tensors="pt")

            # NOTE(review): no speaker embeddings are passed here; verify
            # whether generate_speech requires them in the installed
            # transformers version.
            with torch.no_grad():
                speech = model.generate_speech(inputs["input_ids"], vocoder=vocoder)

            audio = speech.numpy()

            import scipy.io.wavfile

            out_path = _reserve_temp_file(".wav")
            scipy.io.wavfile.write(out_path, 16000, audio.astype(np.float32))
            return out_path, 16000

        if model_type == "bark" and tts_model:
            processor = tts_model["processor"]
            model = tts_model["model"]

            inputs = processor(text, return_tensors="pt")

            with torch.no_grad():
                audio_array = model.generate(**inputs)
                audio_array = audio_array.cpu().numpy().squeeze()

            import scipy.io.wavfile

            out_path = _reserve_temp_file(".wav")
            scipy.io.wavfile.write(out_path, 24000, audio_array.astype(np.float32))
            return out_path, 24000

        return None, None

    except Exception as e:
        print(f"❌ Model generation error: {e}")
        _discard_file(out_path)
        return None, None

def generate_with_gtts(text):
    """Synthesise *text* with gTTS (English, normal speed) into a temp MP3.

    gTTS is imported lazily so the app still starts when the package is not
    installed.

    Args:
        text: Text to speak.

    Returns:
        ``(path, "gTTS")`` on success, where ``path`` is a temporary ``.mp3``
        file owned by the caller; ``(None, None)`` on any failure (the error
        is printed and the partially written temp file, if any, is removed).
    """
    mp3_path = None
    try:
        from gtts import gTTS

        # Build the request before touching the filesystem so an invalid
        # request never creates a file in the first place.
        speech = gTTS(text=text, lang='en', slow=False)

        # Reserve a path and close the handle before gTTS opens it for writing.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
            mp3_path = handle.name

        speech.save(mp3_path)
        return mp3_path, "gTTS"
    except Exception as e:
        print(f"❌ gTTS error: {e}")
        if mp3_path is not None:
            import os

            try:
                os.remove(mp3_path)
            except OSError:
                # Best-effort cleanup only; report the gTTS failure, not this.
                pass
        return None, None

def create_basic_audio(text):
    """Render a short synthetic tone for *text* when no speech engine worked.

    The clip lasts 0.05 s per character, capped at 5 s, at 24 kHz.  Each of
    the first 20 characters contributes one sine partial whose frequency is
    220 Hz plus the character code modulo 300, with amplitude 0.3 divided by
    its 1-based position.  The sum is shaped by a fast-attack / exponential
    decay envelope and written as float32 samples to a temporary WAV file.

    Returns:
        ``(path, "Basic")`` where ``path`` is the temporary ``.wav`` file.
    """
    import scipy.io.wavfile

    sample_rate = 24000
    fundamental = 220

    seconds = min(len(text) * 0.05, 5)
    timeline = np.linspace(0, seconds, int(sample_rate * seconds))

    # Accumulate one partial per leading character.
    signal = np.zeros_like(timeline)
    for position, symbol in enumerate(text[:20], start=1):
        pitch = fundamental + (ord(symbol) % 300)
        loudness = 0.3 / position
        signal += loudness * np.sin(2 * np.pi * pitch * timeline)

    # Fade in quickly, then decay.
    decay = np.exp(-2 * timeline)
    attack = 1 - np.exp(-8 * timeline)
    signal *= decay * attack

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name
        scipy.io.wavfile.write(wav_path, sample_rate, signal.astype(np.float32))
        return wav_path, "Basic"

# Create the interface
with gr.Blocks() as demo:
    # Stylesheet and header banner are emitted as the first component.
    gr.HTML(html_with_css)

    # Two columns: text entry and controls (wider), then playback and status.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5,
            )

            with gr.Row():
                # The chosen value is passed to the handler and echoed in the
                # status message.
                speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")

            with gr.Row():
                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
            <div class="status-info">
                <strong>Ready</strong><br>
                Enter text and click Generate Speech
            </div>
            """)

    # Tell the user which backend was selected at start-up.
    gr.Markdown("### ℹ️ System Information")
    backend_labels = {
        "coqui": "✅ **Model**: Coqui XTTS (Multilingual)",
        "speecht5": "✅ **Model**: Microsoft SpeechT5",
        "bark": "✅ **Model**: Suno Bark",
        "gtts": "⚠️ **Model**: gTTS (Fallback - requires internet)",
    }
    gr.Markdown(backend_labels.get(model_type, "⚠️ **Model**: Basic audio generation"))

    # Clickable sample sentences that fill the text box.
    gr.Markdown("### 💡 Examples")
    sample_sentences = [
        "Hello! Welcome to the text-to-speech system.",
        "This is a demonstration of AI speech synthesis.",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is transforming technology.",
    ]
    gr.Examples(
        examples=[[sentence] for sentence in sample_sentences],
        inputs=text_input,
        label="Click to try:",
    )

    def process_text(text, speed_val):
        """Produce audio for *text* and an HTML status snippet.

        Tries the loaded model first, then gTTS, then the synthetic tone.
        Returns ``(audio_path_or_None, status_html)``.
        """
        if not text or not text.strip():
            return None, """
            <div class="status-info">
                <strong>⚠️ Please enter text</strong><br>
                Type something in the text box above
            </div>
            """

        print(f"Processing: {text[:50]}...")

        # Preferred path: the locally loaded model.
        clip, _ = generate_with_model(text, speed_val)
        origin = "AI Model"

        # Walk the fallbacks in order until one yields a file.
        for fallback in (generate_with_gtts, create_basic_audio):
            if clip is not None:
                break
            clip, origin = fallback(text)

        if not clip:
            return None, """
            <div class="status-info">
                <strong>❌ Failed to generate</strong><br>
                Please try different text
            </div>
            """

        summary = f"""
            <div class="status-success">
                <strong>✅ Speech Generated!</strong><br>
                Source: {origin} • Characters: {len(text)}<br>
                Speed: {speed_val}x
            </div>
            """
        return clip, summary

    def clear_all():
        """Reset the text box, the audio player and the status panel."""
        cleared_notice = """
        <div class="status-info">
            <strong>Cleared</strong><br>
            Ready for new text input
        </div>
        """
        return "", None, cleared_notice

    # Wire the buttons to their handlers.
    generate_btn.click(
        fn=process_text,
        inputs=[text_input, speed],
        outputs=[audio_output, status],
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[text_input, audio_output, status],
    )

# Launch the app
if __name__ == "__main__":
    # Serve on all interfaces at port 7860, surface handler errors in the UI,
    # and keep Gradio's own start-up output quiet.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "quiet": True,
    }
    demo.launch(**launch_options)