# TextTOVoiceConv / app.py
# (Hugging Face Spaces file-viewer header removed — it was not valid Python.)
import gradio as gr
import torch
import numpy as np
import tempfile
import time
import warnings
warnings.filterwarnings("ignore")
# Injected HTML/CSS theme for the Gradio page: forces a white background
# with pure-black textarea text, and styles the header banner, buttons,
# and the "status-success" / "status-info" message cards used below.
html_with_css = """
<!DOCTYPE html>
<html>
<head>
<style>
body, .gradio-container {
background: white !important;
color: #333333 !important;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
margin: 0;
padding: 20px;
}
.header {
text-align: center;
padding: 2rem;
background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%);
border-radius: 16px;
margin-bottom: 2rem;
color: white;
}
.header h1 {
font-size: 2.5em;
margin: 0 0 0.5rem 0;
font-weight: 700;
}
/* BLACK TEXT ON WHITE - MOST IMPORTANT */
textarea {
background: white !important;
border: 2px solid #4F46E5 !important;
border-radius: 12px !important;
color: #000000 !important; /* Pure black text */
padding: 1rem !important;
font-size: 16px !important;
width: 100% !important;
min-height: 120px !important;
font-family: monospace !important;
}
textarea::placeholder {
color: #666666 !important;
}
button {
padding: 0.75rem 1.5rem !important;
border-radius: 10px !important;
font-weight: 600 !important;
margin: 0.5rem !important;
cursor: pointer !important;
}
.primary-btn {
background: linear-gradient(135deg, #4F46E5 0%, #7C3AED 100%) !important;
border: none !important;
color: white !important;
}
.secondary-btn {
background: white !important;
border: 2px solid #D1D5DB !important;
color: #374151 !important;
}
.card {
background: white;
border: 1px solid #E5E7EB;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
}
.status-success {
background: #DCFCE7;
border: 1px solid #86EFAC;
border-left: 4px solid #10B981;
color: #065F46;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
.status-info {
background: #DBEAFE;
border: 1px solid #93C5FD;
border-left: 4px solid #3B82F6;
color: #1E40AF;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
</style>
</head>
<body>
<div class="header">
<h1>🎡 Text-to-Speech</h1>
<p>Convert text to speech with smaller AI model</p>
</div>
</body>
</html>
"""
print("πŸš€ Starting TTS System...")
# Pick the smallest TTS model that fits the Hugging Face Spaces free tier.
def load_small_tts_model():
    """Load the first TTS backend that initializes successfully.

    Tries, in order: Coqui XTTS, Microsoft SpeechT5, Suno Bark.  Each
    attempt is isolated in its own ``except Exception`` so that ANY
    failure (missing package, download error, out-of-memory) falls
    through to the next option.  The original version caught only
    ``ImportError`` around the Coqui attempt, so a runtime failure
    there aborted the whole chain straight to gTTS without ever trying
    SpeechT5 or Bark.

    Returns:
        tuple: ``(backend_name, model)`` where ``backend_name`` is one
        of ``"coqui"``, ``"speecht5"``, ``"bark"``, or ``"gtts"``
        (``model`` is ``None`` for the gTTS fallback).
    """
    print("πŸ“₯ Loading smaller TTS model...")
    # Option 1: Coqui TTS.  NOTE(review): xtts_v2 is a fairly large
    # model — confirm it actually fits the free-tier memory budget.
    try:
        from TTS.api import TTS
        tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
        print("βœ… Loaded Coqui XTTS model")
        return ("coqui", tts_model)
    except Exception as e:
        print(f" Coqui TTS not available: {e}")
    # Option 2: SpeechT5 (smaller than VibeVoice).
    try:
        from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Keep everything on CPU to save memory.
        model = model.to("cpu")
        vocoder = vocoder.to("cpu")
        print("βœ… Loaded SpeechT5 model (CPU)")
        return ("speecht5", {"processor": processor, "model": model, "vocoder": vocoder})
    except Exception as e:
        print(f" SpeechT5 failed: {e}")
    # Option 3: Bark (small and fast).
    try:
        from transformers import AutoProcessor, BarkModel
        processor = AutoProcessor.from_pretrained("suno/bark-small")
        model = BarkModel.from_pretrained("suno/bark-small")
        model = model.to("cpu")
        print("βœ… Loaded Bark model (CPU)")
        return ("bark", {"processor": processor, "model": model})
    except Exception as e:
        print(f" Bark failed: {e}")
    print("⚠️ No small TTS model loaded, using gTTS fallback")
    return ("gtts", None)
# Select a backend once at import time; every request reuses this pair.
model_type, tts_model = load_small_tts_model()
def generate_with_model(text, speed=1.0):
    """Run the preloaded TTS backend on *text*.

    Dispatches on the module globals ``model_type`` / ``tts_model`` set
    at startup.  Returns ``(wav_path, sample_rate)`` on success and
    ``(None, None)`` for empty input, a missing backend, or any
    synthesis failure.

    NOTE(review): ``speed`` is accepted for interface compatibility but
    is not applied by any backend — confirm whether callers expect it
    to change the audio.
    """
    try:
        if not text or not text.strip():
            return None, None
        print(f"πŸ”Š Generating: {text[:50]}...")
        if model_type == "coqui" and tts_model:
            # Coqui renders straight to disk at its native 24 kHz.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                tts_model.tts_to_file(text=text, file_path=out.name)
                return out.name, 24000
        if model_type == "speecht5" and tts_model:
            bundle = tts_model
            encoded = bundle["processor"](text=text, return_tensors="pt")
            with torch.no_grad():
                waveform = bundle["model"].generate_speech(
                    encoded["input_ids"], vocoder=bundle["vocoder"]
                )
            samples = waveform.numpy()
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                import scipy.io.wavfile
                scipy.io.wavfile.write(out.name, 16000, samples.astype(np.float32))
                return out.name, 16000
        if model_type == "bark" and tts_model:
            encoded = tts_model["processor"](text, return_tensors="pt")
            with torch.no_grad():
                raw = tts_model["model"].generate(**encoded)
            samples = raw.cpu().numpy().squeeze()
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
                import scipy.io.wavfile
                scipy.io.wavfile.write(out.name, 24000, samples.astype(np.float32))
                return out.name, 24000
        return None, None
    except Exception as e:
        print(f"❌ Model generation error: {e}")
        return None, None
def generate_with_gtts(text):
    """Synthesize *text* via Google TTS (needs network access).

    Returns ``(mp3_path, "gTTS")`` on success, or ``(None, None)`` if
    the gtts package is missing or the request fails.
    """
    try:
        from gtts import gTTS
        synthesized = gTTS(text=text, lang='en', slow=False)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out:
            synthesized.save(out.name)
            return out.name, "gTTS"
    except Exception as e:
        print(f"❌ gTTS error: {e}")
        return None, None
def create_basic_audio(text):
    """Synthesize a short placeholder tone sequence for *text*.

    Last-resort fallback when no TTS backend is available: the first 20
    characters are mapped onto decaying sine partials so the output at
    least varies with the input.

    Args:
        text: Text to "voice"; only its length and leading characters
            influence the audio.

    Returns:
        tuple: ``(path_to_wav_file, "Basic")``.
    """
    import scipy.io.wavfile
    # 50 ms per character, clamped to [0.25 s, 5 s].  The lower bound
    # guarantees a non-empty file: the original produced a zero-sample
    # WAV for empty text (duration == 0).
    duration = max(min(len(text) * 0.05, 5), 0.25)
    sr = 24000
    t = np.linspace(0, duration, int(sr * duration))
    base_freq = 220
    audio = np.zeros_like(t)
    # One partial per character; the character code selects the pitch.
    for i, char in enumerate(text[:20]):
        freq = base_freq + (ord(char) % 300)
        amp = 0.3 / (i + 1)
        audio += amp * np.sin(2 * np.pi * freq * t)
    # Attack/decay envelope, then clip: the partial amplitudes can sum
    # past 1.0 (0.3 * H_20 β‰ˆ 1.08), which would wrap/clip on playback.
    envelope = np.exp(-2 * t) * (1 - np.exp(-8 * t))
    audio *= envelope
    audio = np.clip(audio, -1.0, 1.0)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        scipy.io.wavfile.write(f.name, sr, audio.astype(np.float32))
        return f.name, "Basic"
# Build the Gradio UI.  Everything below runs at import time; the event
# callbacks are defined and wired inside the Blocks context.
with gr.Blocks() as demo:
    # Inject the white-background / black-text theme defined above.
    gr.HTML(html_with_css)
    # Main layout: wide input column on the left, audio output on the right.
    with gr.Row():
        # Input column
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“ Enter Text")
            text_input = gr.Textbox(
                label="",
                placeholder="Type your text here... (Black text on white background)",
                lines=5
            )
            with gr.Row():
                # NOTE(review): this value is passed to generate_with_model
                # but no backend currently applies it — confirm intent.
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speed"
                )
            with gr.Row():
                generate_btn = gr.Button("✨ Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")
        # Output column
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Audio Output")
            audio_output = gr.Audio(type="filepath", label="")
            status = gr.HTML("""
<div class="status-info">
<strong>Ready</strong><br>
Enter text and click Generate Speech
</div>
""")
    # Report which backend load_small_tts_model() selected at startup.
    gr.Markdown("### ℹ️ System Information")
    if model_type == "coqui":
        gr.Markdown("βœ… **Model**: Coqui XTTS (Multilingual)")
    elif model_type == "speecht5":
        gr.Markdown("βœ… **Model**: Microsoft SpeechT5")
    elif model_type == "bark":
        gr.Markdown("βœ… **Model**: Suno Bark")
    elif model_type == "gtts":
        gr.Markdown("⚠️ **Model**: gTTS (Fallback - requires internet)")
    else:
        gr.Markdown("⚠️ **Model**: Basic audio generation")
    # Clickable example prompts that fill the text box.
    gr.Markdown("### πŸ’‘ Examples")
    gr.Examples(
        examples=[
            ["Hello! Welcome to the text-to-speech system."],
            ["This is a demonstration of AI speech synthesis."],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming technology."]
        ],
        inputs=text_input,
        label="Click to try:"
    )

    # Event handlers
    def process_text(text, speed_val):
        """Synthesize *text*, falling back through gTTS to basic audio.

        Returns (audio_filepath_or_None, status_html) for the two UI
        outputs wired below.
        """
        if not text or not text.strip():
            return None, """
<div class="status-info">
<strong>⚠️ Please enter text</strong><br>
Type something in the text box above
</div>
"""
        print(f"Processing: {text[:50]}...")
        # Try model first (the returned sample rate is unused here).
        audio_file, sr = generate_with_model(text, speed_val)
        source = "AI Model"
        # Fallback to gTTS
        if audio_file is None:
            audio_file, source = generate_with_gtts(text)
        # Last resort: basic audio
        if audio_file is None:
            audio_file, source = create_basic_audio(text)
        if audio_file:
            message = f"""
<div class="status-success">
<strong>βœ… Speech Generated!</strong><br>
Source: {source} β€’ Characters: {len(text)}<br>
Speed: {speed_val}x
</div>
"""
            return audio_file, message
        else:
            return None, """
<div class="status-info">
<strong>❌ Failed to generate</strong><br>
Please try different text
</div>
"""

    def clear_all():
        """Reset the text box, audio player, and status panel."""
        return "", None, """
<div class="status-info">
<strong>Cleared</strong><br>
Ready for new text input
</div>
"""

    # Connect buttons
    generate_btn.click(
        process_text,
        [text_input, speed],
        [audio_output, status]
    )
    clear_btn.click(
        clear_all,
        [],
        [text_input, audio_output, status]
    )
# Launch the app when run as a script.
if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=True
    )