import gradio as gr
import torch
import numpy as np
import tempfile
import time
import warnings
warnings.filterwarnings("ignore")
# HTML with inline CSS for white background and black text
# NOTE(review): currently an empty string — the CSS appears to have been
# stripped; gr.HTML(html_with_css) below renders nothing until it is restored.
html_with_css = """
"""
print("đ Starting TTS System...")
# Try to load a SMALLER TTS model that fits in free tier
def load_small_tts_model():
    """Try progressively simpler TTS backends; return the first that loads.

    Returns:
        tuple[str, object | None]: ``(backend_name, handle)`` where
        backend_name is one of ``"coqui"``, ``"speecht5"``, ``"bark"`` or
        ``"gtts"``.  The handle is the loaded model (Coqui) or a dict of
        components (SpeechT5 / Bark); it is ``None`` for the gTTS fallback,
        which needs no local model.
    """
    try:
        print("Loading smaller TTS model...")

        # Option 1: Coqui TTS (XTTS v2, multilingual, smaller footprint).
        # Catch Exception, not just ImportError: model download or
        # initialization can fail for many reasons, and any failure must fall
        # through to the next backend instead of aborting the whole loader.
        try:
            from TTS.api import TTS
            tts_model = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
            )
            print("Loaded Coqui XTTS model")
            return ("coqui", tts_model)
        except Exception as e:
            print(f"Coqui TTS not available: {e}")

        # Option 2: Microsoft SpeechT5 (smaller than VibeVoice).
        try:
            from transformers import (
                SpeechT5Processor,
                SpeechT5ForTextToSpeech,
                SpeechT5HifiGan,
            )
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Keep everything on CPU to stay inside the free-tier memory budget.
            model = model.to("cpu")
            vocoder = vocoder.to("cpu")
            print("Loaded SpeechT5 model (CPU)")
            return ("speecht5", {"processor": processor, "model": model,
                                 "vocoder": vocoder})
        except Exception as e:
            print(f"SpeechT5 failed: {e}")

        # Option 3: Suno Bark (small variant).
        try:
            from transformers import AutoProcessor, BarkModel
            processor = AutoProcessor.from_pretrained("suno/bark-small")
            model = BarkModel.from_pretrained("suno/bark-small")
            model = model.to("cpu")  # CPU only, same memory-budget reason
            print("Loaded Bark model (CPU)")
            return ("bark", {"processor": processor, "model": model})
        except Exception as e:
            print(f"Bark failed: {e}")

        print("No small TTS model loaded, using gTTS fallback")
        return ("gtts", None)
    except Exception as e:
        print(f"Error loading models: {e}")
        return ("gtts", None)
# Load model once at import time; these module-level globals are read by
# generate_with_model() and by the UI's model-info section below.
model_type, tts_model = load_small_tts_model()
def generate_with_model(text, speed=1.0):
    """Synthesize *text* with whichever backend load_small_tts_model() selected.

    Args:
        text: Text to speak; empty/whitespace-only/None input yields (None, None).
        speed: Speed hint from the UI slider. NOTE(review): no backend below
            actually applies it — kept only for interface compatibility.

    Returns:
        tuple: ``(wav_path, sample_rate)`` on success, ``(None, None)`` on any
        failure so callers can fall back to gTTS / basic audio.
    """
    try:
        if not text or not text.strip():
            return None, None
        print(f"đ Generating: {text[:50]}...")

        if model_type == "coqui" and tts_model:
            # Create the temp path first, then write after the handle is
            # closed: on Windows an open NamedTemporaryFile cannot be
            # reopened for writing by another writer.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                out_path = f.name
            tts_model.tts_to_file(text=text, file_path=out_path)
            return out_path, 24000  # XTTS v2 emits 24 kHz audio

        if model_type == "speecht5" and tts_model:
            processor = tts_model["processor"]
            model = tts_model["model"]
            vocoder = tts_model["vocoder"]
            inputs = processor(text=text, return_tensors="pt")
            with torch.no_grad():
                # NOTE(review): SpeechT5 normally requires speaker embeddings;
                # if this raises, the except below returns (None, None) and the
                # caller falls back — confirm with the installed transformers.
                speech = model.generate_speech(inputs["input_ids"], vocoder=vocoder)
            audio = speech.numpy()
            import scipy.io.wavfile
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                out_path = f.name
            scipy.io.wavfile.write(out_path, 16000, audio.astype(np.float32))
            return out_path, 16000

        if model_type == "bark" and tts_model:
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text, return_tensors="pt")
            with torch.no_grad():
                audio_array = model.generate(**inputs)
            audio_array = audio_array.cpu().numpy().squeeze()
            import scipy.io.wavfile
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                out_path = f.name
            scipy.io.wavfile.write(out_path, 24000, audio_array.astype(np.float32))
            return out_path, 24000

        # No usable backend (gtts fallback mode) — let the caller handle it.
        return None, None
    except Exception as e:
        print(f"â Model generation error: {e}")
        return None, None
def generate_with_gtts(text):
    """Fallback synthesis via Google TTS.

    Needs network access and the ``gtts`` package; writes an MP3 and returns
    ``(path, "gTTS")``, or ``(None, None)`` on any failure.
    """
    try:
        from gtts import gTTS
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
            speech = gTTS(text=text, lang='en', slow=False)
            speech.save(handle.name)
            return handle.name, "gTTS"
    except Exception as err:
        print(f"â gTTS error: {err}")
        return None, None
def create_basic_audio(text):
    """Create basic audio as a last resort.

    Builds a short decaying chord whose component frequencies are derived
    from the first characters of *text*, writes it to a temp WAV at 24 kHz,
    and returns ``(path, "Basic")``.
    """
    import scipy.io.wavfile
    sample_rate = 24000
    # Roughly 50 ms per character, capped at 5 seconds.
    seconds = min(len(text) * 0.05, 5)
    timeline = np.linspace(0, seconds, int(sample_rate * seconds))
    # One sine per character (first 20 chars); frequency keyed to the char
    # code, amplitude shrinking with position so early chars dominate.
    signal = np.zeros_like(timeline)
    for idx, ch in enumerate(text[:20]):
        signal += (0.3 / (idx + 1)) * np.sin(
            2 * np.pi * (220 + (ord(ch) % 300)) * timeline
        )
    # Attack/decay envelope: quick fade-in, exponential fade-out.
    signal *= np.exp(-2 * timeline) * (1 - np.exp(-8 * timeline))
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        scipy.io.wavfile.write(handle.name, sample_rate, signal.astype(np.float32))
        return handle.name, "Basic"
# Create the interface.
# NOTE(review): the original source contained string literals broken across
# lines and mojibake emoji; user-visible labels are reconstructed as plain text.
with gr.Blocks() as demo:
    # Page-level CSS/HTML (html_with_css is currently empty — placeholder).
    gr.HTML(html_with_css)

    # Main layout: wide input column, narrower output column.
    with gr.Row():
        # Input column
        with gr.Column(scale=2):
            gr.Markdown("### Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5,
            )
            with gr.Row():
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speed",
                )
            with gr.Row():
                generate_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

        # Output column
        with gr.Column(scale=1):
            gr.Markdown("### Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
            Ready
            Enter text and click Generate Speech
            """)

    # Model info — report which backend the loader actually selected.
    gr.Markdown("### System Information")
    if model_type == "coqui":
        gr.Markdown("**Model**: Coqui XTTS (Multilingual)")
    elif model_type == "speecht5":
        gr.Markdown("**Model**: Microsoft SpeechT5")
    elif model_type == "bark":
        gr.Markdown("**Model**: Suno Bark")
    elif model_type == "gtts":
        gr.Markdown("**Model**: gTTS (Fallback - requires internet)")
    else:
        gr.Markdown("**Model**: Basic audio generation")

    # Clickable example prompts.
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to the text-to-speech system."],
            ["This is a demonstration of AI speech synthesis."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming technology."],
        ],
        inputs=text_input,
        label="Click to try:",
    )

    # --- Event handlers ---------------------------------------------------
    def process_text(text, speed_val):
        """Synthesize *text*; returns (audio_path_or_None, status_html)."""
        if not text or not text.strip():
            return None, """
            Please enter text
            Type something in the text box above
            """
        print(f"Processing: {text[:50]}...")
        # Preferred path: the locally loaded model.
        audio_file, sr = generate_with_model(text, speed_val)
        source = "AI Model"
        # Fallback 1: gTTS (needs internet).
        if audio_file is None:
            audio_file, source = generate_with_gtts(text)
        # Fallback 2: synthetic placeholder tone.
        if audio_file is None:
            audio_file, source = create_basic_audio(text)
        if audio_file:
            message = f"""
            Speech Generated!
            Source: {source} - Characters: {len(text)}
            Speed: {speed_val}x
            """
            return audio_file, message
        return None, """
        Failed to generate
        Please try different text
        """

    def clear_all():
        """Reset the text box, audio player and status banner."""
        return "", None, """
        Cleared
        Ready for new text input
        """

    # Wire the buttons to the handlers.
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status],
    )
    clear_btn.click(
        clear_all,
        [],
        [text_input, audio_output, status],
    )
# Launch the app (binds on all interfaces so the hosting proxy can reach it).
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen beyond localhost (required on Spaces)
        server_port=7860,       # the default Hugging Face Spaces port
        show_error=True,        # surface tracebacks in the UI
        quiet=True              # suppress routine console logging
    )