Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Files Community

Text2speech / app.py

Nick021402

Update app.py

c1fa46d verified 8 months ago

raw

history blame contribute delete

14.1 kB

	import gradio as gr
	import torch
	import numpy as np
	import re
	import soundfile as sf
	import tempfile
	import os
	import nltk
	from nltk.tokenize import sent_tokenize
	import warnings
	import time
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset

	warnings.filterwarnings("ignore")

	# Make sure the sentence-tokenizer data is present; if any lookup fails,
	# fetch both resources in one quiet download.
	try:
	    for _resource_path in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
	        nltk.data.find(_resource_path)
	except LookupError:
	    nltk.download(['punkt', 'punkt_tab'], quiet=True)


	class LongFormTTS:
	    """Long-form text-to-speech pipeline built on SpeechT5.

	    Input text is normalised (abbreviations expanded, small integers spelled
	    out), split into length-bounded chunks, synthesised chunk by chunk and
	    concatenated with a short pause after every chunk.
	    """

	    # Indices into the "Matthijs/cmu-arctic-xvectors" validation split that
	    # are offered as selectable voices.
	    _SPEAKER_XVECTOR_IDS = (7306, 7339, 7341, 7345, 7367, 7422)

	    # Abbreviation -> spoken form.
	    _ABBREVIATIONS = {
	        'Dr.': 'Doctor',
	        'Mr.': 'Mister',
	        'Mrs.': 'Missus',
	        'Ms.': 'Miss',
	        'Prof.': 'Professor',
	        'etc.': 'etcetera',
	        'vs.': 'versus',
	        'e.g.': 'for example',
	        'i.e.': 'that is',
	        'St.': 'Street',
	        'Ave.': 'Avenue',
	        'Blvd.': 'Boulevard',
	        'Inc.': 'Incorporated',
	        'Corp.': 'Corporation',
	        'Ltd.': 'Limited',
	        'U.S.': 'United States',
	        'U.K.': 'United Kingdom',
	        'Ph.D.': 'PhD',
	        'M.D.': 'MD',
	    }

	    # One alternation over all abbreviations, longest first.  The lookbehind
	    # rejects matches that start in the middle of a word, so "ATMs." is not
	    # rewritten to "ATMiss" the way a plain substring replace would.
	    _ABBREVIATION_PATTERN = re.compile(
	        r"(?<!\w)(?:"
	        + "|".join(map(re.escape, sorted(_ABBREVIATIONS, key=len, reverse=True)))
	        + ")"
	    )

	    _ONES = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
	    _TEENS = ("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
	              "sixteen", "seventeen", "eighteen", "nineteen")
	    _TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
	             "eighty", "ninety")

	    def __init__(self):
	        """Load the SpeechT5 model, vocoder and speaker embeddings.

	        Raises:
	            RuntimeError: if any model or dataset fails to load; the original
	                exception is chained as the cause.
	        """
	        print("🔄 Loading TTS models...")
	        try:
	            # Load SpeechT5 - most reliable for HF Spaces
	            print("Loading SpeechT5 TTS...")
	            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	            # Load speaker embeddings dataset
	            print("Loading speaker embeddings...")
	            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	            # Display name -> x-vector, in the order shown in the UI.
	            self.speakers = {
	                f"Speaker {index} ({xvector_id})": embeddings_dataset[xvector_id]["xvector"]
	                for index, xvector_id in enumerate(self._SPEAKER_XVECTOR_IDS, start=1)
	            }
	            self.speaker_ids = list(self.speakers.keys())
	            self.device = "cuda" if torch.cuda.is_available() else "cpu"
	            self.model = self.model.to(self.device)
	            self.vocoder = self.vocoder.to(self.device)
	            print("✅ SpeechT5 loaded successfully!")
	        except Exception as e:
	            print(f"❌ Failed to load SpeechT5: {e}")
	            # RuntimeError is still caught by callers' `except Exception`;
	            # `from e` keeps the original traceback attached.
	            raise RuntimeError(f"TTS model loading failed: {e}") from e

	    def preprocess_text(self, text):
	        """Clean and prepare text for TTS.

	        Collapses whitespace, expands known abbreviations, spells out
	        standalone integers of up to four digits, and replaces every
	        character outside word characters, whitespace and basic punctuation
	        with a space.

	        Args:
	            text: raw input string.

	        Returns:
	            The normalised string with surrounding whitespace removed.
	        """
	        text = re.sub(r'\s+', ' ', text.strip())
	        text = self._ABBREVIATION_PATTERN.sub(
	            lambda m: self._ABBREVIATIONS[m.group()], text
	        )
	        # Longer digit runs are left untouched by this pattern.
	        text = re.sub(
	            r'\b(\d{1,4})\b',
	            lambda m: self.number_to_words(int(m.group())),
	            text,
	        )
	        text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
	        return text.strip()

	    def number_to_words(self, num):
	        """Spell out an integer in English words.

	        Args:
	            num: integer to convert.

	        Returns:
	            The spoken form, e.g. ``123`` -> ``"one hundred twenty three"``.
	            Magnitudes above 9999 are returned as their decimal digits;
	            negative values are prefixed with ``"minus"``.
	        """
	        if num < 0:
	            return "minus " + self.number_to_words(-num)
	        if num == 0:
	            return "zero"
	        if num > 9999:
	            return str(num)
	        if num < 10:
	            return self._ONES[num]
	        if num < 20:
	            return self._TEENS[num - 10]
	        if num < 100:
	            tens_digit, unit = divmod(num, 10)
	            words = self._TENS[tens_digit]
	            return words if unit == 0 else f"{words} {self._ONES[unit]}"
	        if num < 1000:
	            hundreds, remainder = divmod(num, 100)
	            words = self._ONES[hundreds] + " hundred"
	            # Only append the remainder when there is one; recursing on 0
	            # would tack "zero" onto exact hundreds.
	            if remainder:
	                words += " " + self.number_to_words(remainder)
	            return words
	        thousands, remainder = divmod(num, 1000)
	        words = self.number_to_words(thousands) + " thousand"
	        if remainder:
	            words += " " + self.number_to_words(remainder)
	        return words

	    def chunk_text(self, text, max_length=400):
	        """Split text into manageable chunks.

	        Sentences (from NLTK's ``sent_tokenize``) are packed greedily into
	        chunks of at most ``max_length`` characters.  A sentence that is
	        itself longer than ``max_length`` is split on whitespace; a single
	        word longer than the limit becomes its own chunk.

	        Args:
	            text: text to split.
	            max_length: maximum chunk length in characters.

	        Returns:
	            List of non-empty chunk strings in reading order.
	        """
	        sentences = sent_tokenize(text)
	        chunks = []
	        current_chunk = ""
	        for sentence in sentences:
	            sentence = sentence.strip()
	            if not sentence:
	                continue
	            if len(current_chunk + " " + sentence) > max_length:
	                # The sentence does not fit: flush what has been collected.
	                if current_chunk:
	                    chunks.append(current_chunk.strip())
	                if len(sentence) > max_length:
	                    # Oversized sentence: fall back to word-level packing.
	                    temp_chunk = ""
	                    for word in sentence.split():
	                        if len(temp_chunk + " " + word) > max_length:
	                            if temp_chunk:
	                                chunks.append(temp_chunk.strip())
	                                temp_chunk = word
	                            else:
	                                chunks.append(word)
	                        else:
	                            temp_chunk = temp_chunk + " " + word if temp_chunk else word
	                    # The tail of the long sentence may still take more text.
	                    current_chunk = temp_chunk
	                else:
	                    current_chunk = sentence
	            else:
	                current_chunk = current_chunk + " " + sentence if current_chunk else sentence
	        if current_chunk:
	            chunks.append(current_chunk.strip())
	        return [chunk for chunk in chunks if chunk.strip()]

	    def generate_speech_chunk(self, text_chunk, speaker_embedding):
	        """Generate speech for a single chunk.

	        Args:
	            text_chunk: text to synthesise.
	            speaker_embedding: speaker x-vector as stored in ``self.speakers``.

	        Returns:
	            The generated audio (converted to a NumPy array when the model
	            returns a tensor), or ``None`` if synthesis failed.  Failures are
	            logged and swallowed so one bad chunk does not abort a long job.
	        """
	        try:
	            inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
	            with torch.no_grad():
	                speech = self.model.generate_speech(
	                    inputs["input_ids"],
	                    torch.tensor(speaker_embedding).unsqueeze(0).to(self.device),
	                    vocoder=self.vocoder
	                )
	            if isinstance(speech, torch.Tensor):
	                speech = speech.cpu().numpy()
	            return speech
	        except Exception as e:
	            print(f"Error generating speech for chunk: {e}")
	            print(f"Chunk text: {text_chunk}")
	            return None

	    def generate_long_speech(self, text, speaker_id=None, progress_callback=None):
	        """Generate speech for long text.

	        Args:
	            text: raw input text.
	            speaker_id: key into ``self.speakers``; ``None``, empty or
	                unknown values fall back to the first available speaker.
	            progress_callback: optional callable receiving a status string
	                before each chunk is synthesised.

	        Returns:
	            ``(audio, sample_rate)`` where ``audio`` is a 1-D array
	            peak-normalised to 0.95, or ``(None, None)`` when there is
	            nothing to synthesise or every chunk failed.
	        """
	        processed_text = self.preprocess_text(text)
	        print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
	        chunks = self.chunk_text(processed_text)
	        print(f"Split into {len(chunks)} chunks")
	        if not chunks:
	            return None, None

	        # Resolve the voice once; an unknown name falls back to the default
	        # instead of surfacing as a bare KeyError.
	        default_speaker = self.speaker_ids[0]
	        speaker_embedding = self.speakers.get(speaker_id or default_speaker)
	        if speaker_embedding is None:
	            print(f"Unknown speaker {speaker_id!r}; falling back to {default_speaker}")
	            speaker_embedding = self.speakers[default_speaker]

	        sample_rate = 16000  # presumably the vocoder's output rate; verify
	        pause_samples = int(0.4 * sample_rate)
	        audio_segments = []
	        total_chunks = len(chunks)
	        for position, chunk in enumerate(chunks, start=1):
	            if progress_callback:
	                progress_callback(f"Processing chunk {position}/{total_chunks}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
	            print(f"Processing chunk {position}: {chunk}")
	            audio_chunk = self.generate_speech_chunk(chunk, speaker_embedding)
	            if audio_chunk is None or len(audio_chunk) == 0:
	                continue
	            if len(audio_chunk.shape) > 1:
	                audio_chunk = np.mean(audio_chunk, axis=0)
	            audio_segments.append(audio_chunk)
	            # Pause after each chunk, in the chunk's dtype so concatenation
	            # does not upcast the whole waveform.
	            audio_segments.append(np.zeros(pause_samples, dtype=audio_chunk.dtype))
	        if not audio_segments:
	            return None, None
	        final_audio = np.concatenate(audio_segments)
	        max_val = np.max(np.abs(final_audio))
	        if max_val > 0:
	            final_audio = final_audio / max_val * 0.95
	        return final_audio, sample_rate


	# Build the single shared TTS engine at import time.  On failure the name is
	# bound to None so the rest of the module can detect the broken state.
	print("🚀 Initializing TTS system...")
	try:
	    tts_system = LongFormTTS()
	    print("✅ TTS system ready!")
	except Exception as init_error:
	    print(f"❌ TTS initialization failed: {init_error}")
	    tts_system = None


	def text_to_speech_interface(text, speaker="Speaker 1 (7306)", progress=gr.Progress()):
	    """Main Gradio interface function.

	    Validates the request, synthesises the text with the global TTS system
	    and writes the result to a temporary audio file.

	    Args:
	        text: text to convert to speech (at most 50,000 characters).
	        speaker: voice name passed through to ``generate_long_speech``.
	        progress: Gradio progress tracker.

	    Returns:
	        ``(audio_path, status_message)``; ``audio_path`` is ``None`` whenever
	        no audio was produced.
	    """
	    if tts_system is None:
	        return None, "❌ TTS system is not available. Please check the logs."
	    if not text or not text.strip():
	        return None, "⚠️ Please enter some text to convert to speech."
	    if len(text) > 50000:
	        return None, "⚠️ Text is too long. Please keep it under 50,000 characters."

	    def progress_callback(message):
	        progress(0.5, desc=message)

	    try:
	        progress(0.1, desc="🔄 Starting text-to-speech conversion...")
	        audio, sample_rate = tts_system.generate_long_speech(text, speaker, progress_callback)
	        if audio is None or len(audio) == 0:
	            return None, "❌ Failed to generate audio."
	        progress(0.9, desc="💾 Saving audio file...")
	        # Reserve a path, then close the handle before writing: reopening a
	        # still-open NamedTemporaryFile by name is not portable.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	            audio_path = tmp_file.name
	        try:
	            sf.write(audio_path, audio, sample_rate)
	        except Exception:
	            # delete=False means nobody else will clean this up.
	            try:
	                os.remove(audio_path)
	            except OSError:
	                pass
	            raise
	        progress(1.0, desc="✅ Complete!")
	        duration = len(audio) / sample_rate
	        return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
	    except Exception as e:
	        error_msg = f"❌ Error: {str(e)}"
	        print(f"TTS Error: {e}")
	        return None, error_msg


	# Create Gradio interface
	def create_interface():
	    """Assemble and return the Gradio Blocks UI for the TTS demo.

	    Static markup is prepared first; components are then created in display
	    order inside the Blocks context and wired to their callbacks.
	    """

	    def update_char_count(text):
	        count = len(text or "")
	        if count <= 50000:
	            color = "#28a745"
	        else:
	            color = "#dc3545"
	        return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'

	    page_css = """
	.main-header {
	text-align: center;
	margin-bottom: 2rem;
	background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	}
	"""
	    header_html = """
	<div class="main-header">
	<h1>🎤 Long-Form Text-to-Speech Generator</h1>
	<p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
	</div>
	"""
	    ready_html = """
	<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
	<h4>🟢 System Ready</h4>
	<p>Using <strong>Microsoft SpeechT5</strong> - High quality neural text-to-speech</p>
	</div>
	"""
	    failed_html = """
	<div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #dc3545; background: #f8d7da;">
	<h4>🔴 System Error</h4>
	<p>TTS system failed to initialize. Please refresh the page.</p>
	</div>
	"""
	    features_html = """
	<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1.5rem; border-radius: 15px; margin: 1rem 0; box-shadow: 0 4px 15px rgba(0,0,0,0.1);">
	<h3>✨ Key Features</h3>
	<ul style="margin: 0; padding-left: 1.2em;">
	<li>🚀 Handles long texts</li>
	<li>🎭 Multiple human voices</li>
	<li>⚡ Smart text processing</li>
	<li>🔧 Auto chunking</li>
	<li>🎵 Natural-sounding speech</li>
	<li>🔊 MP3 audio output</li>
	</ul>
	</div>
	"""
	    sample_rows = [
	        ["Hello! Welcome to our advanced text-to-speech system.", "Speaker 1 (7306)"],
	        ["The quick brown fox jumps over the lazy dog.", "Speaker 2 (7339)"],
	        ["Artificial intelligence has revolutionized many aspects of our lives.", "Speaker 3 (7341)"],
	    ]

	    # The voice list is empty when the engine failed to load.
	    voice_choices = tts_system.speaker_ids if tts_system else []
	    default_voice = tts_system.speaker_ids[0] if tts_system and tts_system.speaker_ids else None

	    with gr.Blocks(
	        title="🎤 Long-Form Text-to-Speech",
	        theme=gr.themes.Soft(),
	        css=page_css
	    ) as demo:
	        gr.HTML(header_html)
	        # System status
	        gr.HTML(ready_html if tts_system else failed_html)
	        with gr.Row():
	            with gr.Column(scale=2):
	                text_input = gr.Textbox(
	                    label="📝 Enter Your Text",
	                    placeholder="Type or paste your text here... (Max 50,000 characters)",
	                    lines=10,
	                    max_lines=20,
	                    info="Supports any length text with automatic chunking for optimal quality"
	                )
	                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
	                speaker_dropdown = gr.Dropdown(
	                    choices=voice_choices,
	                    value=default_voice,
	                    label="🗣️ Choose Voice"
	                )
	                generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
	            with gr.Column(scale=1):
	                gr.HTML(features_html)
	        status_output = gr.Textbox(label="📊 Status", interactive=False, value="Ready to generate speech! Enter some text above.")
	        audio_output = gr.Audio(label="🔊 Generated Speech", type="filepath", show_download_button=True)

	        # Live character counter under the text box.
	        text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])

	        generate_btn.click(
	            fn=text_to_speech_interface,
	            inputs=[text_input, speaker_dropdown],
	            outputs=[audio_output, status_output],
	            show_progress=True
	        )

	        gr.Examples(
	            examples=sample_rows,
	            inputs=[text_input, speaker_dropdown],
	            label="📚 Try These Examples"
	        )

	    return demo


	# Script entry point: build the UI and serve it.
	if __name__ == "__main__":
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }
	    demo = create_interface()
	    demo.launch(**launch_options)