Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import re | |
| import soundfile as sf | |
| import tempfile | |
| import os | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| import warnings | |
| import time | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| from datasets import load_dataset | |
warnings.filterwarnings("ignore")

# Verify that both Punkt tokenizer resources (including punkt_tab) are
# installed locally; if either lookup fails, download the pair quietly.
try:
    for _resource_path in ('tokenizers/punkt', 'tokenizers/punkt_tab'):
        nltk.data.find(_resource_path)
except LookupError:
    nltk.download(['punkt', 'punkt_tab'], quiet=True)
class LongFormTTS:
    """Long-form text-to-speech pipeline built on Microsoft SpeechT5.

    The pipeline normalises the input text, splits it into chunks of bounded
    length, synthesises each chunk with a selected speaker embedding, and
    concatenates the pieces (separated by short pauses) into one waveform.
    """

    # Abbreviations expanded to their spoken form before synthesis.
    _ABBREVIATIONS = {
        'Dr.': 'Doctor',
        'Mr.': 'Mister',
        'Mrs.': 'Missus',
        'Ms.': 'Miss',
        'Prof.': 'Professor',
        'etc.': 'etcetera',
        'vs.': 'versus',
        'e.g.': 'for example',
        'i.e.': 'that is',
        'St.': 'Street',
        'Ave.': 'Avenue',
        'Blvd.': 'Boulevard',
        'Inc.': 'Incorporated',
        'Corp.': 'Corporation',
        'Ltd.': 'Limited',
        'U.S.': 'United States',
        'U.K.': 'United Kingdom',
        'Ph.D.': 'PhD',
        'M.D.': 'MD',
    }

    # Lookup tables used by number_to_words.
    _ONES = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    _TEENS = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
              "sixteen", "seventeen", "eighteen", "nineteen"]
    _TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    def __init__(self):
        """Load the SpeechT5 processor, model, vocoder and speaker embeddings.

        Sets ``processor``, ``model``, ``vocoder``, ``speakers`` (display
        name -> x-vector), ``speaker_ids`` (display names in order) and
        ``device`` ("cuda" when available, otherwise "cpu").

        Raises:
            Exception: if any model or dataset fails to load; the original
                error is chained as the cause.
        """
        print("π Loading TTS models...")
        try:
            # Load SpeechT5 - most reliable for HF Spaces
            print("Loading SpeechT5 TTS...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Load speaker embeddings dataset
            print("Loading speaker embeddings...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            # Store multiple speakers, keyed by a human-readable label that
            # embeds the dataset row index.
            self.speakers = {
                f"Speaker {i+1} ({row_index})": embeddings_dataset[row_index]["xvector"]
                for i, row_index in enumerate([7306, 7339, 7341, 7345, 7367, 7422])
            }
            self.speaker_ids = list(self.speakers.keys())
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            self.vocoder = self.vocoder.to(self.device)
            print("β SpeechT5 loaded successfully!")
        except Exception as e:
            print(f"β Failed to load SpeechT5: {e}")
            raise Exception(f"TTS model loading failed: {e}") from e

    def preprocess_text(self, text):
        """Clean and prepare text for TTS.

        Collapses whitespace, expands known abbreviations, spells out
        integers of one to four digits (years included, read as cardinal
        numbers), and replaces every character outside the allowed
        word/punctuation set with a space.

        Args:
            text: raw input string.

        Returns:
            The normalised string, stripped of leading/trailing whitespace.
        """
        text = re.sub(r'\s+', ' ', text.strip())
        for abbr, full in self._ABBREVIATIONS.items():
            # The lookbehind stops an abbreviation from matching the tail of
            # a longer word (e.g. "revs." must not become "reversus").
            text = re.sub(r'(?<!\w)' + re.escape(abbr), full, text)
        text = re.sub(r'\b(\d{1,4})\b', lambda m: self.number_to_words(int(m.group())), text)
        text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
        return text.strip()

    def number_to_words(self, num):
        """Spell out an integer in English words.

        Args:
            num: integer to convert.

        Returns:
            The spoken form for magnitudes up to 9999 (negative values are
            prefixed with "minus"); larger magnitudes are returned as their
            decimal digits.
        """
        if num < 0:
            return "minus " + self.number_to_words(-num)
        if num == 0:
            return "zero"
        if num > 9999:
            return str(num)
        if num < 10:
            return self._ONES[num]
        if num < 20:
            return self._TEENS[num - 10]
        if num < 100:
            return self._TENS[num // 10] + ("" if num % 10 == 0 else " " + self._ONES[num % 10])
        if num < 1000:
            words = self._ONES[num // 100] + " hundred"
            # Only append the remainder when it is non-zero, separated by a
            # space, so 100 -> "one hundred" and 123 -> "one hundred twenty three".
            if num % 100:
                words += " " + self.number_to_words(num % 100)
            return words
        thousands = num // 1000
        remainder = num % 1000
        result = self.number_to_words(thousands) + " thousand"
        if remainder > 0:
            result += " " + self.number_to_words(remainder)
        return result

    def chunk_text(self, text, max_length=400):
        """Split text into manageable chunks.

        Sentences (from NLTK's ``sent_tokenize``) are packed greedily into
        chunks of at most ``max_length`` characters. A sentence longer than
        ``max_length`` is split on whitespace; a single word longer than
        ``max_length`` becomes its own chunk.

        Args:
            text: text to split.
            max_length: maximum chunk length in characters.

        Returns:
            List of non-empty chunk strings in reading order.
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_chunk + " " + sentence) > max_length:
                # The sentence does not fit: flush what has been gathered.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                if len(sentence) > max_length:
                    # Oversized sentence: pack it word by word.
                    words = sentence.split()
                    temp_chunk = ""
                    for word in words:
                        if len(temp_chunk + " " + word) > max_length:
                            if temp_chunk:
                                chunks.append(temp_chunk.strip())
                                temp_chunk = word
                            else:
                                chunks.append(word)
                        else:
                            temp_chunk = temp_chunk + " " + word if temp_chunk else word
                    # The tail of the split sentence seeds the next chunk.
                    current_chunk = temp_chunk
                else:
                    current_chunk = sentence
            else:
                current_chunk = current_chunk + " " + sentence if current_chunk else sentence
        if current_chunk:
            chunks.append(current_chunk.strip())
        return [chunk for chunk in chunks if chunk.strip()]

    def generate_speech_chunk(self, text_chunk, speaker_embedding):
        """Generate speech for a single chunk.

        Args:
            text_chunk: text to synthesise.
            speaker_embedding: x-vector for the desired voice; it is wrapped
                in a tensor with a leading batch dimension.

        Returns:
            The waveform as a NumPy array, or ``None`` if synthesis raised
            (the error and the chunk text are printed; this is deliberate
            best-effort so one bad chunk does not abort a long text).
        """
        try:
            inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    torch.tensor(speaker_embedding).unsqueeze(0).to(self.device),
                    vocoder=self.vocoder
                )
            if isinstance(speech, torch.Tensor):
                speech = speech.cpu().numpy()
            return speech
        except Exception as e:
            print(f"Error generating speech for chunk: {e}")
            print(f"Chunk text: {text_chunk}")
            return None

    def generate_long_speech(self, text, speaker_id=None, progress_callback=None):
        """Generate speech for long text.

        Args:
            text: raw text of any length.
            speaker_id: key into ``self.speakers``; a falsy value selects the
                first available speaker.
            progress_callback: optional callable that receives a status
                message before each chunk is synthesised.

        Returns:
            ``(audio, sample_rate)`` where ``audio`` is a 1-D NumPy array
            peak-normalised to 0.95, or ``(None, None)`` when there is
            nothing to synthesise or every chunk failed.

        Raises:
            KeyError: if ``speaker_id`` is not a known speaker.
        """
        processed_text = self.preprocess_text(text)
        print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
        chunks = self.chunk_text(processed_text)
        print(f"Split into {len(chunks)} chunks")
        if not chunks:
            return None, None

        # Loop invariants: resolve the voice and build the pause buffer once.
        sample_rate = 16000  # hard-coded output rate used for pauses and the return value
        speaker_embedding = self.speakers[speaker_id or self.speaker_ids[0]]
        silence = np.zeros(int(0.4 * sample_rate))  # 0.4 s pause after each chunk

        # Generate speech for each chunk
        audio_segments = []
        for i, chunk in enumerate(chunks):
            if progress_callback:
                progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
            print(f"Processing chunk {i+1}: {chunk}")
            audio_chunk = self.generate_speech_chunk(chunk, speaker_embedding)
            if audio_chunk is not None and len(audio_chunk) > 0:
                # Collapse any multi-dimensional output to a single channel.
                if len(audio_chunk.shape) > 1:
                    audio_chunk = np.mean(audio_chunk, axis=0)
                audio_segments.append(audio_chunk)
                audio_segments.append(silence)
            time.sleep(0.1)
        if not audio_segments:
            return None, None

        final_audio = np.concatenate(audio_segments)
        # Peak-normalise, leaving a little headroom; skip all-zero audio.
        max_val = np.max(np.abs(final_audio))
        if max_val > 0:
            final_audio = final_audio / max_val * 0.95
        return final_audio, sample_rate
# Module-wide TTS instance shared by the Gradio handlers. It is set to None
# when model loading fails so the UI can still start and report the problem.
print("π Initializing TTS system...")
try:
    tts_system = LongFormTTS()
except Exception as e:
    print(f"β TTS initialization failed: {e}")
    tts_system = None
else:
    print("β TTS system ready!")
def text_to_speech_interface(text, speaker="Speaker 1 (7306)", progress=gr.Progress()):
    """Main Gradio interface function.

    Validates the request, synthesises the text with the global
    ``tts_system`` and writes the result to a temporary audio file.

    Args:
        text: text entered by the user.
        speaker: speaker label passed through to ``generate_long_speech``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(audio_path, status_message)``; ``audio_path`` is ``None`` whenever
        validation or generation fails, and the message explains why.
    """
    if tts_system is None:
        return None, "β TTS system is not available. Please check the logs."
    if not text or not text.strip():
        return None, "β οΈ Please enter some text to convert to speech."
    if len(text) > 50000:
        return None, "β οΈ Text is too long. Please keep it under 50,000 characters."

    def progress_callback(message):
        # Per-chunk updates only carry a message, so the bar stays at mid-way.
        progress(0.5, desc=message)

    try:
        progress(0.1, desc="π Starting text-to-speech conversion...")
        audio, sample_rate = tts_system.generate_long_speech(text, speaker, progress_callback)
        if audio is None or len(audio) == 0:
            return None, "β Failed to generate audio."
        progress(0.9, desc="πΎ Saving audio file...")
        # Reserve a path that outlives this call (delete=False), and close
        # our handle before the audio library writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            audio_path = tmp_file.name
        try:
            sf.write(audio_path, audio, sample_rate)
        except Exception:
            # Do not leave an orphaned temp file behind when the write fails.
            try:
                os.remove(audio_path)
            except OSError:
                pass
            raise
        progress(1.0, desc="β Complete!")
        duration = len(audio) / sample_rate
        return audio_path, f"β Generated {duration:.1f} seconds of audio successfully!"
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        error_msg = f"β Error: {str(e)}"
        print(f"TTS Error: {e}")
        return None, error_msg
# Create Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and return it.

    The page shows a header, a status banner reflecting whether the global
    ``tts_system`` loaded, a text box with a live character counter, a voice
    selector, a generate button, a feature panel, and the status/audio
    outputs wired to ``text_to_speech_interface``.
    """
    # Static markup, kept apart from the layout code below.
    page_css = """
    .main-header {
        text-align: center;
        margin-bottom: 2rem;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
    }
    """
    header_html = """
    <div class="main-header">
        <h1>π€ Long-Form Text-to-Speech Generator</h1>
        <p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
    </div>
    """
    ready_html = """
    <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
        <h4>π’ System Ready</h4>
        <p>Using <strong>Microsoft SpeechT5</strong> - High quality neural text-to-speech</p>
    </div>
    """
    error_html = """
    <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #dc3545; background: #f8d7da;">
        <h4>π΄ System Error</h4>
        <p>TTS system failed to initialize. Please refresh the page.</p>
    </div>
    """
    features_html = """
    <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1.5rem; border-radius: 15px; margin: 1rem 0; box-shadow: 0 4px 15px rgba(0,0,0,0.1);">
        <h3>β¨ Key Features</h3>
        <ul style="margin: 0; padding-left: 1.2em;">
            <li>π Handles long texts</li>
            <li>π Multiple human voices</li>
            <li>β‘ Smart text processing</li>
            <li>π§ Auto chunking</li>
            <li>π΅ Natural-sounding speech</li>
            <li>π MP3 audio output</li>
        </ul>
    </div>
    """
    example_rows = [
        ["Hello! Welcome to our advanced text-to-speech system.", "Speaker 1 (7306)"],
        ["The quick brown fox jumps over the lazy dog.", "Speaker 2 (7339)"],
        ["Artificial intelligence has revolutionized many aspects of our lives.", "Speaker 3 (7341)"],
    ]

    # Voices come from the loaded TTS system; none are offered if it failed.
    if tts_system:
        voice_choices = tts_system.speaker_ids
        default_voice = voice_choices[0] if voice_choices else None
    else:
        voice_choices = []
        default_voice = None

    def update_char_count(text):
        # Green while within the 50,000-character limit, red beyond it.
        count = len(text or "")
        color = "#dc3545" if count > 50000 else "#28a745"
        return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'

    with gr.Blocks(
        title="π€ Long-Form Text-to-Speech",
        theme=gr.themes.Soft(),
        css=page_css
    ) as demo:
        gr.HTML(header_html)
        # System status
        gr.HTML(ready_html if tts_system else error_html)
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="π Enter Your Text",
                    placeholder="Type or paste your text here... (Max 50,000 characters)",
                    lines=10,
                    max_lines=20,
                    info="Supports any length text with automatic chunking for optimal quality"
                )
                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                speaker_dropdown = gr.Dropdown(
                    choices=voice_choices,
                    value=default_voice,
                    label="π£οΈ Choose Voice"
                )
                generate_btn = gr.Button("π― Generate Speech", variant="primary", size="lg", scale=1)
            with gr.Column(scale=1):
                gr.HTML(features_html)
        status_output = gr.Textbox(label="π Status", interactive=False, value="Ready to generate speech! Enter some text above.")
        audio_output = gr.Audio(label="π Generated Speech", type="filepath", show_download_button=True)

        # Event wiring: live character counter and the generate action.
        text_input.change(fn=update_char_count, inputs=[text_input], outputs=[char_count])
        generate_btn.click(
            fn=text_to_speech_interface,
            inputs=[text_input, speaker_dropdown],
            outputs=[audio_output, status_output],
            show_progress=True
        )
        gr.Examples(
            examples=example_rows,
            inputs=[text_input, speaker_dropdown],
            label="π Try These Examples"
        )
    return demo
# Launch the application
if __name__ == "__main__":
    # Listen on all interfaces at port 7860 and request a public share link.
    launch_options = dict(server_name="0.0.0.0", server_port=7860, share=True)
    demo = create_interface()
    demo.launch(**launch_options)