import gradio as gr
import torch
import numpy as np
import tempfile
import time
import warnings

warnings.filterwarnings("ignore")

# Header markup shown at the top of the app.
# NOTE(review): the original comment said "inline CSS for white background and
# black text", but no CSS is present in this string — it may have been
# stripped upstream; confirm against the deployed version.
html_with_css = """

đŸŽĩ Text-to-Speech

Convert text to speech with smaller AI model

"""

print("🚀 Starting TTS System...")


def load_small_tts_model():
    """Load a smaller TTS model that fits in Hugging Face Spaces free tier.

    Tries progressively lighter backends in order (Coqui XTTS, SpeechT5,
    Bark) and returns a ``(model_type, model)`` pair, where ``model_type``
    is one of ``"coqui"``, ``"speecht5"``, ``"bark"`` or ``"gtts"``
    (the no-model fallback, in which case ``model`` is ``None``).
    """
    try:
        print("đŸ“Ĩ Loading smaller TTS model...")

        # Option 1: Try Coqui TTS (smaller footprint).
        try:
            from TTS.api import TTS

            # Using a small multilingual model.
            tts_model = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
            )
            print("✅ Loaded Coqui XTTS model")
            return ("coqui", tts_model)
        except ImportError:
            print(" Coqui TTS not available")

        # Option 2: Try SpeechT5 (smaller than VibeVoice).
        try:
            from transformers import (
                SpeechT5Processor,
                SpeechT5ForTextToSpeech,
                SpeechT5HifiGan,
            )
            import torch

            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Use CPU to save memory.
            model = model.to("cpu")
            vocoder = vocoder.to("cpu")
            print("✅ Loaded SpeechT5 model (CPU)")
            return (
                "speecht5",
                {"processor": processor, "model": model, "vocoder": vocoder},
            )
        except Exception as e:
            print(f" SpeechT5 failed: {e}")

        # Option 3: Try Bark (small and fast).
        try:
            from transformers import AutoProcessor, BarkModel
            import torch

            processor = AutoProcessor.from_pretrained("suno/bark-small")
            model = BarkModel.from_pretrained("suno/bark-small")
            # Use CPU.
            model = model.to("cpu")
            print("✅ Loaded Bark model (CPU)")
            return ("bark", {"processor": processor, "model": model})
        except Exception as e:
            print(f" Bark failed: {e}")

        print("âš ī¸ No small TTS model loaded, using gTTS fallback")
        return ("gtts", None)
    except Exception as e:
        print(f"❌ Error loading models: {e}")
        return ("gtts", None)


# Load model once at module import; all request handlers read these globals.
model_type, tts_model = load_small_tts_model()


def _new_temp_path(suffix):
    """Create an empty temp file and return its path with the handle closed.

    Closing the handle before a second writer opens the path is required on
    Windows, where an open NamedTemporaryFile cannot be reopened.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
        return f.name


def generate_with_model(text, speed=1.0):
    """Generate speech using the loaded model.

    Returns ``(wav_path, sample_rate)`` on success, ``(None, None)`` when no
    usable model is loaded, the text is empty, or synthesis fails.

    TODO(review): ``speed`` is accepted but not applied by any backend yet.
    """
    try:
        if not text or not text.strip():
            return None, None
        print(f"🔊 Generating: {text[:50]}...")

        if model_type == "coqui" and tts_model:
            # Coqui TTS writes directly to a file path.
            out_path = _new_temp_path(".wav")
            tts_model.tts_to_file(text=text, file_path=out_path)
            return out_path, 24000

        elif model_type == "speecht5" and tts_model:
            # SpeechT5: text -> spectrogram -> waveform via HiFi-GAN vocoder.
            processor = tts_model["processor"]
            model = tts_model["model"]
            vocoder = tts_model["vocoder"]
            inputs = processor(text=text, return_tensors="pt")
            with torch.no_grad():
                # NOTE(review): no speaker embeddings are supplied — confirm
                # the installed transformers version accepts this.
                speech = model.generate_speech(inputs["input_ids"], vocoder=vocoder)
            audio = speech.numpy()
            out_path = _new_temp_path(".wav")
            import scipy.io.wavfile
            scipy.io.wavfile.write(out_path, 16000, audio.astype(np.float32))
            return out_path, 16000

        elif model_type == "bark" and tts_model:
            # Bark: autoregressive generation straight to a waveform.
            processor = tts_model["processor"]
            model = tts_model["model"]
            inputs = processor(text, return_tensors="pt")
            with torch.no_grad():
                audio_array = model.generate(**inputs)
            audio_array = audio_array.cpu().numpy().squeeze()
            out_path = _new_temp_path(".wav")
            import scipy.io.wavfile
            scipy.io.wavfile.write(out_path, 24000, audio_array.astype(np.float32))
            return out_path, 24000

        # model_type == "gtts" or model missing: signal the caller to fall back.
        return None, None
    except Exception as e:
        print(f"❌ Model generation error: {e}")
        return None, None


def generate_with_gtts(text):
    """Fallback to gTTS (requires internet but works well).

    Returns ``(mp3_path, "gTTS")`` on success, ``(None, None)`` on failure.
    Unlike ``generate_with_model``, the second element is a source label,
    not a sample rate — callers use it only for display.
    """
    try:
        from gtts import gTTS

        out_path = _new_temp_path(".mp3")
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(out_path)
        return out_path, "gTTS"
    except Exception as e:
        print(f"❌ gTTS error: {e}")
        return None, None


def create_basic_audio(text):
    """Create basic audio as a last resort (no model, no network).

    Synthesizes a short chord of decaying sine tones whose frequencies are
    derived from the first 20 characters of *text*, so different inputs sound
    different. Returns ``(wav_path, "Basic")``.
    """
    import scipy.io.wavfile

    # Roughly 50 ms per character, capped at 5 seconds.
    duration = min(len(text) * 0.05, 5)
    sr = 24000
    t = np.linspace(0, duration, int(sr * duration))

    # Create varied audio: one sine per character, decreasing amplitude.
    base_freq = 220
    audio = np.zeros_like(t)
    for i, char in enumerate(text[:20]):
        freq = base_freq + (ord(char) % 300)
        amp = 0.3 / (i + 1)
        audio += amp * np.sin(2 * np.pi * freq * t)

    # Attack/decay envelope so the tone fades in and out without clicks.
    envelope = np.exp(-2 * t) * (1 - np.exp(-8 * t))
    audio *= envelope

    out_path = _new_temp_path(".wav")
    scipy.io.wavfile.write(out_path, sr, audio.astype(np.float32))
    return out_path, "Basic"


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Add the header markup.
    gr.HTML(html_with_css)

    # Main layout.
    with gr.Row():
        # Input column.
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5,
            )
            with gr.Row():
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speed",
                )
            with gr.Row():
                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

        # Output column.
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
Ready
Enter text and click Generate Speech
""")

    # Model info — reflects whichever backend actually loaded at startup.
    gr.Markdown("### â„šī¸ System Information")
    if model_type == "coqui":
        gr.Markdown("✅ **Model**: Coqui XTTS (Multilingual)")
    elif model_type == "speecht5":
        gr.Markdown("✅ **Model**: Microsoft SpeechT5")
    elif model_type == "bark":
        gr.Markdown("✅ **Model**: Suno Bark")
    elif model_type == "gtts":
        gr.Markdown("âš ī¸ **Model**: gTTS (Fallback - requires internet)")
    else:
        gr.Markdown("âš ī¸ **Model**: Basic audio generation")

    # Examples.
    gr.Markdown("### 💡 Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to the text-to-speech system."],
            ["This is a demonstration of AI speech synthesis."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming technology."],
        ],
        inputs=text_input,
        label="Click to try:",
    )

    # Event handlers.
    def process_text(text, speed_val):
        """Generate audio for *text*, cascading model -> gTTS -> basic tones.

        Returns ``(audio_path_or_None, status_html)`` for the UI outputs.
        """
        if not text or not text.strip():
            return None, """
âš ī¸ Please enter text
Type something in the text box above
"""
        print(f"Processing: {text[:50]}...")

        # Try model first.
        audio_file, sr = generate_with_model(text, speed_val)
        source = "AI Model"

        # Fallback to gTTS.
        if audio_file is None:
            audio_file, source = generate_with_gtts(text)

        # Last resort: basic audio.
        if audio_file is None:
            audio_file, source = create_basic_audio(text)

        if audio_file:
            message = f"""
✅ Speech Generated!
Source: {source} â€ĸ Characters: {len(text)}
Speed: {speed_val}x
"""
            return audio_file, message
        else:
            return None, """
❌ Failed to generate
Please try different text
"""

    def clear_all():
        """Reset the text box, audio player and status panel."""
        return "", None, """
Cleared
Ready for new text input
"""

    # Connect buttons.
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status],
    )
    clear_btn.click(
        clear_all,
        [],
        [text_input, audio_output, status],
    )

# Launch the app.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=True,
    )