Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Files Community

Nick021402 committed on May 23, 2025

Commit

e392e54

verified ·

1 Parent(s): 73198e4

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -76

app.py CHANGED Viewed

@@ -20,39 +20,34 @@ try:
 except LookupError:
     nltk.download('punkt')
 class LongFormTTS:
     def __init__(self):
         print("🔄 Loading TTS models...")
         try:
             # Load SpeechT5 - most reliable for HF Spaces
             print("Loading SpeechT5 TTS...")
             self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
             self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
             # Load speaker embeddings
             print("Loading speaker embeddings...")
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             # Use a different speaker embedding for more variety
             self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             self.model = self.model.to(self.device)
             self.vocoder = self.vocoder.to(self.device)
             self.speaker_embeddings = self.speaker_embeddings.to(self.device)
             print("✅ SpeechT5 loaded successfully!")
         except Exception as e:
             print(f"❌ Failed to load SpeechT5: {e}")
             raise Exception(f"TTS model loading failed: {e}")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
         # Remove extra whitespace
         text = re.sub(r'\s+', ' ', text.strip())
         # Handle common abbreviations
         abbreviations = {
             'Dr.': 'Doctor',
@@ -75,36 +70,28 @@ class LongFormTTS:
             'Ph.D.': 'PhD',
             'M.D.': 'MD',
         }
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
         # Convert numbers to words (enhanced)
         text = re.sub(r'\b(\d{1,4})\b', lambda m: self.number_to_words(int(m.group())), text)
         # Handle years differently (keep as numbers if between 1000-2100)
         text = re.sub(r'\b(1[0-9]{3}|20[0-9]{2}|2100)\b', lambda m: m.group(), text)
         # Clean up problematic characters but keep essential punctuation
         text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
         text = re.sub(r'\s+', ' ', text)
         return text.strip()
     def number_to_words(self, num):
         """Convert numbers to words"""
         if num == 0:
             return "zero"
         # Keep larger numbers as digits to avoid very long text
         if num > 9999:
             return str(num)
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
         tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
         if num < 10:
             return ones[num]
         elif num < 20:
@@ -120,29 +107,25 @@ class LongFormTTS:
             if remainder > 0:
                 result += " " + self.number_to_words(remainder)
             return result
     def chunk_text(self, text, max_length=400):
         """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
             sentence = sentence.strip()
             if not sentence:
                 continue
             # If adding this sentence would exceed limit
             if len(current_chunk + " " + sentence) > max_length:
                 # Save current chunk if it exists
                 if current_chunk:
                     chunks.append(current_chunk.strip())
                 # If single sentence is too long, split it
                 if len(sentence) > max_length:
                     words = sentence.split()
                     temp_chunk = ""
                     for word in words:
                         if len(temp_chunk + " " + word) > max_length:
                             if temp_chunk:
@@ -153,95 +136,76 @@ class LongFormTTS:
                                 chunks.append(word)
                         else:
                             temp_chunk = temp_chunk + " " + word if temp_chunk else word
                     current_chunk = temp_chunk
                 else:
                     current_chunk = sentence
             else:
                 current_chunk = current_chunk + " " + sentence if current_chunk else sentence
         # Add the last chunk
         if current_chunk:
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk):
         """Generate speech for a single chunk"""
         try:
             # Process text through the model
             inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
             with torch.no_grad():
                 speech = self.model.generate_speech(
                     inputs["input_ids"],
                     self.speaker_embeddings,
                     vocoder=self.vocoder
                 )
             # Convert to numpy and move to CPU
             if isinstance(speech, torch.Tensor):
                 speech = speech.cpu().numpy()
             return speech
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             print(f"Chunk text: {text_chunk}")
             return None
     def generate_long_speech(self, text, progress_callback=None):
         """Generate speech for long text"""
         # Preprocess text
         processed_text = self.preprocess_text(text)
         print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
         # Split into chunks
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
         if not chunks:
             return None, None
         # Generate speech for each chunk
         audio_segments = []
         sample_rate = 16000  # SpeechT5 uses 16kHz
         for i, chunk in enumerate(chunks):
             if progress_callback:
                 progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
             print(f"Processing chunk {i+1}: {chunk}")
             audio_chunk = self.generate_speech_chunk(chunk)
             if audio_chunk is not None and len(audio_chunk) > 0:
                 # Ensure audio is 1D
                 if len(audio_chunk.shape) > 1:
                     audio_chunk = np.mean(audio_chunk, axis=0)
                 audio_segments.append(audio_chunk)
                 # Add pause between chunks (400ms)
                 pause_samples = int(0.4 * sample_rate)
                 silence = np.zeros(pause_samples)
                 audio_segments.append(silence)
             # Small delay to prevent overwhelming the system
             time.sleep(0.1)
         if not audio_segments:
             return None, None
         # Concatenate all segments
         final_audio = np.concatenate(audio_segments)
         # Normalize audio to prevent clipping
         max_val = np.max(np.abs(final_audio))
         if max_val > 0:
             final_audio = final_audio / max_val * 0.95
         return final_audio, sample_rate
 # Global TTS system
 print("🚀 Initializing TTS system...")
 try:
@@ -251,47 +215,40 @@ except Exception as e:
     print(f"❌ TTS initialization failed: {e}")
     tts_system = None
 def text_to_speech_interface(text, progress=gr.Progress()):
     """Main Gradio interface function.

     Validates the input, runs ``tts_system.generate_long_speech`` on it and
     writes the resulting audio to a temporary ``.wav`` file.

     Args:
         text: Text entered by the user.
         progress: Gradio progress tracker used to report status.

     Returns:
         ``(audio_path, status_message)``. ``audio_path`` is ``None`` whenever
         no audio was produced; errors are reported through the message
         instead of being raised.
     """
     # Guard: the global TTS system is None when its initialization failed
     if tts_system is None:
         return None, "❌ TTS system is not available. Please check the logs."
     # Guard: reject missing or whitespace-only input
     if not text or not text.strip():
         return None, "⚠️ Please enter some text to convert to speech."
     # Text length check
-    if len(text) > 5000:
-        return None, "⚠️ Text is too long. Please keep it under 5,000 characters for optimal performance."
     # Every per-chunk message is reported at a fixed 0.5 progress value
     def progress_callback(message):
         progress(0.5, desc=message)
     try:
         progress(0.1, desc="🔄 Starting text-to-speech conversion...")
         # Generate audio
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
         if audio is None or len(audio) == 0:
             return None, "❌ Failed to generate audio. Please try with different text."
         progress(0.9, desc="💾 Saving audio file...")
         # Save to temporary file; delete=False keeps it on disk after the
         # with-block because its path is what gets returned
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
         # Duration in seconds = sample count / samples per second
         duration = len(audio) / sample_rate
         return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
     except Exception as e:
         # Any failure is logged and turned into a status message
         error_msg = f"❌ Error: {str(e)}"
         print(f"TTS Error: {e}")
         return None, error_msg
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
@@ -323,14 +280,12 @@ def create_interface():
         }
         """
     ) as demo:
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
             <p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
         </div>
         """)
         # System status
         if tts_system:
             gr.HTML("""
@@ -346,26 +301,22 @@ def create_interface():
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     label="📝 Enter Your Text",
-                    placeholder="Type or paste your text here... (Max 5,000 characters)",
                     lines=10,
                     max_lines=20,
                     info="Supports any length text with automatic chunking for optimal quality"
                 )
-                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 5,000</span>")
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
                     size="lg",
                     scale=1
                 )
             with gr.Column(scale=1):
                 gr.HTML("""
                 <div class="feature-box">
@@ -375,38 +326,33 @@ def create_interface():
                         <li>🎭 Natural human voice</li>
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
-                        <li>🆓 Completely free</li>
                         <li>📱 Mobile friendly</li>
                         <li>🎵 High quality audio</li>
                     </ul>
                 </div>
                 """)
         # Status and output
         status_output = gr.Textbox(
             label="📊 Status",
             interactive=False,
             value="Ready to generate speech! Enter some text above."
         )
         audio_output = gr.Audio(
             label="🔊 Generated Speech",
             type="filepath",
             show_download_button=True
         )
         # Character counter
         def update_char_count(text):
             count = len(text) if text else 0
-            color = "#28a745" if count <= 5000 else "#dc3545"
-            return f'<span style="color: {color};">Character count: {count:,} / 5,000</span>'
         text_input.change(
             fn=update_char_count,
             inputs=[text_input],
             outputs=[char_count]
         )
         # Generate button click
         generate_btn.click(
             fn=text_to_speech_interface,
@@ -414,7 +360,6 @@ def create_interface():
             outputs=[audio_output, status_output],
             show_progress=True
         )
         # Example texts
         gr.Examples(
             examples=[
@@ -427,7 +372,6 @@ def create_interface():
             inputs=[text_input],
             label="📚 Try These Examples"
         )
         # Information section
         gr.HTML("""
         <div style="margin-top: 2rem; padding: 1.5rem; background: #f8f9fa; border-radius: 10px; border-left: 4px solid #007bff;">
@@ -438,7 +382,6 @@ def create_interface():
                 <li><strong>Neural Synthesis:</strong> Uses Microsoft's SpeechT5 model for speech generation</li>
                 <li><strong>Audio Assembly:</strong> Combines all chunks with natural pauses</li>
             </ol>
             <h4 style="margin-top: 1rem;">💡 Tips for Best Results</h4>
             <ul style="margin: 0.5rem 0; padding-left: 1.5rem;">
                 <li>Use proper punctuation for natural pauses and intonation</li>
@@ -448,9 +391,9 @@ def create_interface():
             </ul>
         </div>
         """)
     return demo
 # Launch the application
 if __name__ == "__main__":
     demo = create_interface()

 except LookupError:
     nltk.download('punkt')
 class LongFormTTS:
     def __init__(self):
         print("🔄 Loading TTS models...")
         try:
             # Load SpeechT5 - most reliable for HF Spaces
             print("Loading SpeechT5 TTS...")
             self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
             self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
             self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
             # Load speaker embeddings
             print("Loading speaker embeddings...")
             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
             # Use a different speaker embedding for more variety
             self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             self.model = self.model.to(self.device)
             self.vocoder = self.vocoder.to(self.device)
             self.speaker_embeddings = self.speaker_embeddings.to(self.device)
             print("✅ SpeechT5 loaded successfully!")
         except Exception as e:
             print(f"❌ Failed to load SpeechT5: {e}")
             raise Exception(f"TTS model loading failed: {e}")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS.

         Collapses whitespace, expands the abbreviations in the table below,
         passes standalone 1-4 digit numbers through ``number_to_words`` and
         replaces every character that is not a word character, whitespace or
         basic punctuation with a space.

         Returns the cleaned text with surrounding whitespace stripped.
         """
         # Remove extra whitespace
         text = re.sub(r'\s+', ' ', text.strip())
         # Handle common abbreviations
         abbreviations = {
             'Dr.': 'Doctor',
             'Ph.D.': 'PhD',
             'M.D.': 'MD',
         }
         # Plain substring replacement, applied in the table's insertion order
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
         # Convert numbers to words (enhanced)
         text = re.sub(r'\b(\d{1,4})\b', lambda m: self.number_to_words(int(m.group())), text)
         # Handle years differently (keep as numbers if between 1000-2100)
         # NOTE(review): this substitution replaces each match with itself, so it
         # changes nothing, and it runs after the pass above has already sent
         # every standalone 1-4 digit number through number_to_words. Confirm
         # whether years were meant to be excluded from that earlier pass.
         text = re.sub(r'\b(1[0-9]{3}|20[0-9]{2}|2100)\b', lambda m: m.group(), text)
         # Clean up problematic characters but keep essential punctuation
         text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
         # The replacement above can leave runs of spaces, so collapse them again
         text = re.sub(r'\s+', ' ', text)
         return text.strip()
     def number_to_words(self, num):
         """Convert numbers to words"""
         if num == 0:
             return "zero"
         # Keep larger numbers as digits to avoid very long text
         if num > 9999:
             return str(num)
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
         tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
         if num < 10:
             return ones[num]
         elif num < 20:
             if remainder > 0:
                 result += " " + self.number_to_words(remainder)
             return result
     def chunk_text(self, text, max_length=400):
         """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
             sentence = sentence.strip()
             if not sentence:
                 continue
             # If adding this sentence would exceed limit
             if len(current_chunk + " " + sentence) > max_length:
                 # Save current chunk if it exists
                 if current_chunk:
                     chunks.append(current_chunk.strip())
                 # If single sentence is too long, split it
                 if len(sentence) > max_length:
                     words = sentence.split()
                     temp_chunk = ""
                     for word in words:
                         if len(temp_chunk + " " + word) > max_length:
                             if temp_chunk:
                                 chunks.append(word)
                         else:
                             temp_chunk = temp_chunk + " " + word if temp_chunk else word
                     current_chunk = temp_chunk
                 else:
                     current_chunk = sentence
             else:
                 current_chunk = current_chunk + " " + sentence if current_chunk else sentence
         # Add the last chunk
         if current_chunk:
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk):
         """Generate speech for a single chunk"""
         try:
             # Process text through the model
             inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
             with torch.no_grad():
                 speech = self.model.generate_speech(
                     inputs["input_ids"],
                     self.speaker_embeddings,
                     vocoder=self.vocoder
                 )
             # Convert to numpy and move to CPU
             if isinstance(speech, torch.Tensor):
                 speech = speech.cpu().numpy()
             return speech
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             print(f"Chunk text: {text_chunk}")
             return None
     def generate_long_speech(self, text, progress_callback=None):
         """Generate speech for long text"""
         # Preprocess text
         processed_text = self.preprocess_text(text)
         print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
         # Split into chunks
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
         if not chunks:
             return None, None
         # Generate speech for each chunk
         audio_segments = []
         sample_rate = 16000  # SpeechT5 uses 16kHz
         for i, chunk in enumerate(chunks):
             if progress_callback:
                 progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
             print(f"Processing chunk {i+1}: {chunk}")
             audio_chunk = self.generate_speech_chunk(chunk)
             if audio_chunk is not None and len(audio_chunk) > 0:
                 # Ensure audio is 1D
                 if len(audio_chunk.shape) > 1:
                     audio_chunk = np.mean(audio_chunk, axis=0)
                 audio_segments.append(audio_chunk)
                 # Add pause between chunks (400ms)
                 pause_samples = int(0.4 * sample_rate)
                 silence = np.zeros(pause_samples)
                 audio_segments.append(silence)
             # Small delay to prevent overwhelming the system
             time.sleep(0.1)
         if not audio_segments:
             return None, None
         # Concatenate all segments
         final_audio = np.concatenate(audio_segments)
         # Normalize audio to prevent clipping
         max_val = np.max(np.abs(final_audio))
         if max_val > 0:
             final_audio = final_audio / max_val * 0.95
         return final_audio, sample_rate
 # Global TTS system
 print("🚀 Initializing TTS system...")
 try:
     print(f"❌ TTS initialization failed: {e}")
     tts_system = None
 def text_to_speech_interface(text, progress=gr.Progress()):
     """Main Gradio interface function.

     Validates the input, runs ``tts_system.generate_long_speech`` on it and
     writes the resulting audio to a temporary ``.wav`` file.

     Args:
         text: Text entered by the user.
         progress: Gradio progress tracker used to report status.

     Returns:
         ``(audio_path, status_message)``. ``audio_path`` is ``None`` whenever
         no audio was produced; errors are reported through the message
         instead of being raised.
     """
     # Guard: the global TTS system is None when its initialization failed
     if tts_system is None:
         return None, "❌ TTS system is not available. Please check the logs."
     # Guard: reject missing or whitespace-only input
     if not text or not text.strip():
         return None, "⚠️ Please enter some text to convert to speech."
     # Text length check
+    if len(text) > 50000:
+        return None, "⚠️ Text is too long. Please keep it under 50,000 characters for optimal performance."
     # Every per-chunk message is reported at a fixed 0.5 progress value
     def progress_callback(message):
         progress(0.5, desc=message)
     try:
         progress(0.1, desc="🔄 Starting text-to-speech conversion...")
         # Generate audio
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
         if audio is None or len(audio) == 0:
             return None, "❌ Failed to generate audio. Please try with different text."
         progress(0.9, desc="💾 Saving audio file...")
         # Save to temporary file; delete=False keeps it on disk after the
         # with-block because its path is what gets returned
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
         # Duration in seconds = sample count / samples per second
         duration = len(audio) / sample_rate
         return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
     except Exception as e:
         # Any failure is logged and turned into a status message
         error_msg = f"❌ Error: {str(e)}"
         print(f"TTS Error: {e}")
         return None, error_msg
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
         }
         """
     ) as demo:
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
             <p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
         </div>
         """)
         # System status
         if tts_system:
             gr.HTML("""
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     label="📝 Enter Your Text",
+                    placeholder="Type or paste your text here... (Max 50,000 characters)",
                     lines=10,
                     max_lines=20,
                     info="Supports any length text with automatic chunking for optimal quality"
                 )
+                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
                     size="lg",
                     scale=1
                 )
             with gr.Column(scale=1):
                 gr.HTML("""
                 <div class="feature-box">
                         <li>🎭 Natural human voice</li>
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
+                        <li>_FREE_ Completely free</li>
                         <li>📱 Mobile friendly</li>
                         <li>🎵 High quality audio</li>
                     </ul>
                 </div>
                 """)
         # Status and output
         status_output = gr.Textbox(
             label="📊 Status",
             interactive=False,
             value="Ready to generate speech! Enter some text above."
         )
         audio_output = gr.Audio(
             label="🔊 Generated Speech",
             type="filepath",
             show_download_button=True
         )
         # Character counter
         def update_char_count(text):
             count = len(text) if text else 0
+            color = "#28a745" if count <= 50000 else "#dc3545"
+            return f'<span style="color: {color};">Character count: {count:,} / 50,000</span>'
         text_input.change(
             fn=update_char_count,
             inputs=[text_input],
             outputs=[char_count]
         )
         # Generate button click
         generate_btn.click(
             fn=text_to_speech_interface,
             outputs=[audio_output, status_output],
             show_progress=True
         )
         # Example texts
         gr.Examples(
             examples=[
             inputs=[text_input],
             label="📚 Try These Examples"
         )
         # Information section
         gr.HTML("""
         <div style="margin-top: 2rem; padding: 1.5rem; background: #f8f9fa; border-radius: 10px; border-left: 4px solid #007bff;">
                 <li><strong>Neural Synthesis:</strong> Uses Microsoft's SpeechT5 model for speech generation</li>
                 <li><strong>Audio Assembly:</strong> Combines all chunks with natural pauses</li>
             </ol>
             <h4 style="margin-top: 1rem;">💡 Tips for Best Results</h4>
             <ul style="margin: 0.5rem 0; padding-left: 1.5rem;">
                 <li>Use proper punctuation for natural pauses and intonation</li>
             </ul>
         </div>
         """)
     return demo
 # Launch the application
 if __name__ == "__main__":
     demo = create_interface()