# TextTOVoiceConv / app.py
# (Hugging Face Spaces file-viewer header removed — it was not valid Python.)
import gradio as gr
import torch
import numpy as np
import tempfile
import time
import warnings
warnings.filterwarnings("ignore")
# Injected HTML/CSS theme for the Gradio page: forces a white background
# with pure-black textarea text, and styles the header banner, buttons,
# and the "status-success" / "status-info" message cards used below.
html_with_css = """
<!DOCTYPE html>
<html>
<head>
<style>
body, .gradio-container {
background: white !important;
color: #333333 !important;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
margin: 0;
padding: 20px;
}
.header {
text-align: center;
padding: 2rem;
background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%);
border-radius: 16px;
margin-bottom: 2rem;
color: white;
}
.header h1 {
font-size: 2.5em;
margin: 0 0 0.5rem 0;
font-weight: 700;
}
/* BLACK TEXT ON WHITE - MOST IMPORTANT */
textarea {
background: white !important;
border: 2px solid #4F46E5 !important;
border-radius: 12px !important;
color: #000000 !important; /* Pure black text */
padding: 1rem !important;
font-size: 16px !important;
width: 100% !important;
min-height: 120px !important;
font-family: monospace !important;
}
textarea::placeholder {
color: #666666 !important;
}
button {
padding: 0.75rem 1.5rem !important;
border-radius: 10px !important;
font-weight: 600 !important;
margin: 0.5rem !important;
cursor: pointer !important;
}
.primary-btn {
background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%) !important;
border: none !important;
color: white !important;
}
.secondary-btn {
background: white !important;
border: 2px solid #D1D5DB !important;
color: #374151 !important;
}
.card {
background: white;
border: 1px solid #E5E7EB;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
}
.status-success {
background: #DCFCE7;
border: 1px solid #86EFAC;
border-left: 4px solid #10B981;
color: #065F46;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
.status-info {
background: #DBEAFE;
border: 1px solid #93C5FD;
border-left: 4px solid #3B82F6;
color: #1E40AF;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
</style>
</head>
<body>
<div class="header">
<h1>🎡 Text-to-Speech</h1>
<p>Convert text to speech with smaller AI model</p>
</div>
</body>
</html>
"""
print("πŸš€ Starting TTS System...")
# Pick the smallest TTS model that fits the Hugging Face Spaces free tier.
def load_small_tts_model():
    """Load the first TTS backend that initializes successfully.

    Tries, in order: Coqui XTTS, Microsoft SpeechT5, Suno Bark.  Each
    attempt is isolated in its own ``except Exception`` so that ANY
    failure (missing package, download error, out-of-memory) falls
    through to the next option.  The original version caught only
    ``ImportError`` around the Coqui attempt, so a runtime failure
    there aborted the whole chain straight to gTTS without ever trying
    SpeechT5 or Bark.

    Returns:
        tuple: ``(backend_name, model)`` where ``backend_name`` is one
        of ``"coqui"``, ``"speecht5"``, ``"bark"``, or ``"gtts"``
        (``model`` is ``None`` for the gTTS fallback).
    """
    print("πŸ“₯ Loading smaller TTS model...")
    # Option 1: Coqui TTS.  NOTE(review): xtts_v2 is a fairly large
    # model — confirm it actually fits the free-tier memory budget.
    try:
        from TTS.api import TTS
        tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
        print("βœ… Loaded Coqui XTTS model")
        return ("coqui", tts_model)
    except Exception as e:
        print(f" Coqui TTS not available: {e}")
    # Option 2: SpeechT5 (smaller than VibeVoice).
    try:
        from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Keep everything on CPU to save memory.
        model = model.to("cpu")
        vocoder = vocoder.to("cpu")
        print("βœ… Loaded SpeechT5 model (CPU)")
        return ("speecht5", {"processor": processor, "model": model, "vocoder": vocoder})
    except Exception as e:
        print(f" SpeechT5 failed: {e}")
    # Option 3: Bark (small and fast).
    try:
        from transformers import AutoProcessor, BarkModel
        processor = AutoProcessor.from_pretrained("suno/bark-small")
        model = BarkModel.from_pretrained("suno/bark-small")
        model = model.to("cpu")
        print("βœ… Loaded Bark model (CPU)")
        return ("bark", {"processor": processor, "model": model})
    except Exception as e:
        print(f" Bark failed: {e}")
    print("⚠️ No small TTS model loaded, using gTTS fallback")
    return ("gtts", None)
# Select a backend once at import time; every request reuses this pair.
model_type, tts_model = load_small_tts_model()
def generate_with_model(text, speed=1.0):
    """Run the preloaded TTS backend on *text*.

    Dispatches on the module globals ``model_type`` / ``tts_model`` set
    at startup.  Returns ``(wav_path, sample_rate)`` on success and
    ``(None, None)`` for empty input, a missing backend, or any
    synthesis failure.

    NOTE(review): ``speed`` is accepted for interface compatibility but
    is not applied by any backend — confirm whether callers expect it
    to change the audio.
    """
    try:
        if not text or not text.strip():
            return None, None
        print(f"πŸ”Š Generating: {text[:50]}...")
        if model_type == "coqui" and tts_model:
            # Coqui renders straight to disk at its native 24 kHz.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                tts_model.tts_to_file(text=text, file_path=out.name)
                return out.name, 24000
        if model_type == "speecht5" and tts_model:
            bundle = tts_model
            encoded = bundle["processor"](text=text, return_tensors="pt")
            with torch.no_grad():
                waveform = bundle["model"].generate_speech(
                    encoded["input_ids"], vocoder=bundle["vocoder"]
                )
            samples = waveform.numpy()
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                import scipy.io.wavfile
                scipy.io.wavfile.write(out.name, 16000, samples.astype(np.float32))
                return out.name, 16000
        if model_type == "bark" and tts_model:
            encoded = tts_model["processor"](text, return_tensors="pt")
            with torch.no_grad():
                raw = tts_model["model"].generate(**encoded)
            samples = raw.cpu().numpy().squeeze()
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                import scipy.io.wavfile
                scipy.io.wavfile.write(out.name, 24000, samples.astype(np.float32))
                return out.name, 24000
        return None, None
    except Exception as e:
        print(f"❌ Model generation error: {e}")
        return None, None
def generate_with_gtts(text):
    """Synthesize *text* via Google TTS (needs network access).

    Returns ``(mp3_path, "gTTS")`` on success, or ``(None, None)`` if
    the gtts package is missing or the request fails.
    """
    try:
        from gtts import gTTS
        synthesized = gTTS(text=text, lang='en', slow=False)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out:
            synthesized.save(out.name)
            return out.name, "gTTS"
    except Exception as e:
        print(f"❌ gTTS error: {e}")
        return None, None
def create_basic_audio(text):
    """Synthesize a short placeholder tone sequence for *text*.

    Last-resort fallback when no TTS backend is available: the first 20
    characters are mapped onto decaying sine partials so the output at
    least varies with the input.

    Args:
        text: Text to "voice"; only its length and leading characters
            influence the audio.

    Returns:
        tuple: ``(path_to_wav_file, "Basic")``.
    """
    import scipy.io.wavfile
    # 50 ms per character, clamped to [0.25 s, 5 s].  The lower bound
    # guarantees a non-empty file: the original produced a zero-sample
    # WAV for empty text (duration == 0).
    duration = max(min(len(text) * 0.05, 5), 0.25)
    sr = 24000
    t = np.linspace(0, duration, int(sr * duration))
    base_freq = 220
    audio = np.zeros_like(t)
    # One partial per character; the character code selects the pitch.
    for i, char in enumerate(text[:20]):
        freq = base_freq + (ord(char) % 300)
        amp = 0.3 / (i + 1)
        audio += amp * np.sin(2 * np.pi * freq * t)
    # Attack/decay envelope, then clip: the partial amplitudes can sum
    # past 1.0 (0.3 * H_20 β‰ˆ 1.08), which would wrap/clip on playback.
    envelope = np.exp(-2 * t) * (1 - np.exp(-8 * t))
    audio *= envelope
    audio = np.clip(audio, -1.0, 1.0)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        scipy.io.wavfile.write(f.name, sr, audio.astype(np.float32))
        return f.name, "Basic"
# Build the Gradio UI.  Everything below runs at import time; the event
# callbacks are defined and wired inside the Blocks context.
with gr.Blocks() as demo:
    # Inject the white-background / black-text theme defined above.
    gr.HTML(html_with_css)
    # Main layout: wide input column on the left, audio output on the right.
    with gr.Row():
        # Input column
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“ Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5
            )
            with gr.Row():
                # NOTE(review): this value is passed to generate_with_model
                # but no backend currently applies it — confirm intent.
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speed"
                )
            with gr.Row():
                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")
        # Output column
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
<div class="status-info">
<strong>Ready</strong><br>
Enter text and click Generate Speech
</div>
""")
    # Report which backend load_small_tts_model() selected at startup.
    gr.Markdown("### ℹ️ System Information")
    if model_type == "coqui":
        gr.Markdown("βœ… **Model**: Coqui XTTS (Multilingual)")
    elif model_type == "speecht5":
        gr.Markdown("βœ… **Model**: Microsoft SpeechT5")
    elif model_type == "bark":
        gr.Markdown("βœ… **Model**: Suno Bark")
    elif model_type == "gtts":
        gr.Markdown("⚠️ **Model**: gTTS (Fallback - requires internet)")
    else:
        gr.Markdown("⚠️ **Model**: Basic audio generation")
    # Clickable example prompts that fill the text box.
    gr.Markdown("### πŸ’‘ Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to the text-to-speech system."],
            ["This is a demonstration of AI speech synthesis."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming technology."]
        ],
        inputs=text_input,
        label="Click to try:"
    )

    # Event handlers
    def process_text(text, speed_val):
        """Synthesize *text*, falling back through gTTS to basic audio.

        Returns (audio_filepath_or_None, status_html) for the two UI
        outputs wired below.
        """
        if not text or not text.strip():
            return None, """
<div class="status-info">
<strong>⚠️ Please enter text</strong><br>
Type something in the text box above
</div>
"""
        print(f"Processing: {text[:50]}...")
        # Try model first (the returned sample rate is unused here).
        audio_file, sr = generate_with_model(text, speed_val)
        source = "AI Model"
        # Fallback to gTTS
        if audio_file is None:
            audio_file, source = generate_with_gtts(text)
        # Last resort: basic audio
        if audio_file is None:
            audio_file, source = create_basic_audio(text)
        if audio_file:
            message = f"""
<div class="status-success">
<strong>βœ… Speech Generated!</strong><br>
Source: {source} β€’ Characters: {len(text)}<br>
Speed: {speed_val}x
</div>
"""
            return audio_file, message
        else:
            return None, """
<div class="status-info">
<strong>❌ Failed to generate</strong><br>
Please try different text
</div>
"""

    def clear_all():
        """Reset the text box, audio player, and status panel."""
        return "", None, """
<div class="status-info">
<strong>Cleared</strong><br>
Ready for new text input
</div>
"""

    # Connect buttons
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status]
    )
    clear_btn.click(
        clear_all,
        [],
        [text_input, audio_output, status]
    )
# Launch the app when run as a script.
if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=True
    )