Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Files Community

Nick021402 committed on May 23, 2025

Commit

c1fa46d

verified ·

1 Parent(s): e447a58

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -46

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 import numpy as np
 import re
 import soundfile as sf
@@ -8,33 +9,44 @@ import nltk
 from nltk.tokenize import sent_tokenize
 import warnings
 import time
-from TTS.api import TTS
 warnings.filterwarnings("ignore")
 # Download required NLTK data including punkt_tab
 try:
     nltk.data.find('tokenizers/punkt')
-    nltk.data.find('tokenizers/punkt_tab')  # This is the missing one!
 except LookupError:
     nltk.download(['punkt', 'punkt_tab'], quiet=True)
 class LongFormTTS:
     def __init__(self):
-        print("🔄 Loading Coqui TTS models...")
         try:
-            # Load Coqui model
-            self.tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)
-            self.speakers = self.tts.speakers
-            self.sample_rate = 22050
-            print("✅ Coqui TTS loaded successfully!")
-            print(f"Available Speakers: {self.speakers}")
         except Exception as e:
-            print(f"❌ Failed to load Coqui TTS: {e}")
-            self.tts = None
-            self.speakers = []
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
@@ -68,15 +80,14 @@ class LongFormTTS:
         return text.strip()
     def number_to_words(self, num):
-        """Convert numbers to words"""
-        if num == 0:
-            return "zero"
-        if num > 9999:
-            return str(num)
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                  "sixteen", "seventeen", "eighteen", "nineteen"]
         tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
         if num < 10:
             return ones[num]
         elif num < 20:
@@ -84,7 +95,7 @@ class LongFormTTS:
         elif num < 100:
             return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
         elif num < 1000:
-            return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
             thousands = num // 1000
             remainder = num % 1000
@@ -93,7 +104,7 @@ class LongFormTTS:
                 result += " " + self.number_to_words(remainder)
             return result
-    def chunk_text(self, text, max_length=200):
         """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
@@ -126,40 +137,55 @@ class LongFormTTS:
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
-    def generate_speech_chunk(self, text_chunk, speaker):
         """Generate speech for a single chunk"""
         try:
-            return self.tts.tts(text=text_chunk, speaker=speaker)
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             return None
-    def generate_long_speech(self, text, speaker=None, progress_callback=None):
         """Generate speech for long text"""
         processed_text = self.preprocess_text(text)
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
         audio_segments = []
-        silence = np.zeros(int(0.4 * self.sample_rate), dtype=np.float32)
         for i, chunk in enumerate(chunks):
             if progress_callback:
                 progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
             print(f"Processing chunk {i+1}: {chunk}")
-            audio_chunk = self.generate_speech_chunk(chunk, speaker)
-            if audio_chunk is not None:
-                audio_segments.append(np.array(audio_chunk))
                 audio_segments.append(silence)
             time.sleep(0.1)
         if not audio_segments:
             return None, None
         final_audio = np.concatenate(audio_segments)
         max_val = np.max(np.abs(final_audio))
         if max_val > 0:
             final_audio = final_audio / max_val * 0.95
-        return final_audio, self.sample_rate
 # Global TTS system
@@ -172,7 +198,8 @@ except Exception as e:
     tts_system = None
-def text_to_speech_interface(text, speaker="p225", progress=gr.Progress()):
     if tts_system is None:
         return None, "❌ TTS system is not available. Please check the logs."
     if not text or not text.strip():
@@ -186,23 +213,25 @@ def text_to_speech_interface(text, speaker="p225", progress=gr.Progress()):
     try:
         progress(0.1, desc="🔄 Starting text-to-speech conversion...")
         audio, sample_rate = tts_system.generate_long_speech(text, speaker, progress_callback)
-        if audio is None:
             return None, "❌ Failed to generate audio."
         progress(0.9, desc="💾 Saving audio file...")
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
         duration = len(audio) / sample_rate
         return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
     except Exception as e:
-        return None, f"❌ Error: {str(e)}"
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
-        title="🎤 Long-Form Text-to-Speech (Coqui)",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
@@ -218,14 +247,15 @@ def create_interface():
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
-            <p style="color: #666; font-size: 1.1em;">Choose a voice and transform any written text into expressive human-like speech</p>
         </div>
         """)
-        if tts_system and tts_system.speakers:
             gr.HTML("""
             <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
                 <h4>🟢 System Ready</h4>
-                <p>Using <strong>Coqui TTS</strong> with multiple speaker support</p>
             </div>
             """)
         else:
@@ -235,7 +265,6 @@ def create_interface():
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
@@ -243,12 +272,12 @@ def create_interface():
                     placeholder="Type or paste your text here... (Max 50,000 characters)",
                     lines=10,
                     max_lines=20,
-                    info="Supports any length text with automatic chunking"
                 )
                 char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                 speaker_dropdown = gr.Dropdown(
-                    choices=tts_system.speakers if tts_system and tts_system.speakers else [],
-                    value=tts_system.speakers[0] if tts_system and tts_system.speakers else None,
                     label="🗣️ Choose Voice"
                 )
                 generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
@@ -262,6 +291,7 @@ def create_interface():
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
                         <li>🎵 Natural-sounding speech</li>
                     </ul>
                 </div>
                 """)
@@ -284,9 +314,9 @@ def create_interface():
         gr.Examples(
             examples=[
-                ["Hello! Welcome to our advanced text-to-speech system.", "p225"],
-                ["The quick brown fox jumps over the lazy dog.", "p226"],
-                ["Artificial intelligence has revolutionized many aspects of our daily lives.", "p227"],
             ],
             inputs=[text_input, speaker_dropdown],
             label="📚 Try These Examples"

 import gradio as gr
+import torch
 import numpy as np
 import re
 import soundfile as sf
 from nltk.tokenize import sent_tokenize
 import warnings
 import time
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
 warnings.filterwarnings("ignore")
 # Download required NLTK data including punkt_tab
 try:
     nltk.data.find('tokenizers/punkt')
+    nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download(['punkt', 'punkt_tab'], quiet=True)
 class LongFormTTS:
     def __init__(self):
+        print("🔄 Loading TTS models...")
         try:
+            # Load SpeechT5 - most reliable for HF Spaces
+            print("Loading SpeechT5 TTS...")
+            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+            # Load speaker embeddings dataset
+            print("Loading speaker embeddings...")
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            # Store multiple speakers
+            self.speakers = {
+                f"Speaker {i+1} ({id})": embeddings_dataset[id]["xvector"]
+                for i, id in enumerate([7306, 7339, 7341, 7345, 7367, 7422])
+            }
+            self.speaker_ids = list(self.speakers.keys())
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.model = self.model.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
+            print("✅ SpeechT5 loaded successfully!")
         except Exception as e:
+            print(f"❌ Failed to load SpeechT5: {e}")
+            raise Exception(f"TTS model loading failed: {e}")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
         return text.strip()
     def number_to_words(self, num):
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                  "sixteen", "seventeen", "eighteen", "nineteen"]
         tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
+        if num == 0:
+            return "zero"
+        if num > 9999:
+            return str(num)
         if num < 10:
             return ones[num]
         elif num < 20:
         elif num < 100:
             return tens[num // 10] + ("" if num % 10 == 0 else " " + ones[num % 10])
         elif num < 1000:
+            return ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100)).strip()
         else:
             thousands = num // 1000
             remainder = num % 1000
                 result += " " + self.number_to_words(remainder)
             return result
+    def chunk_text(self, text, max_length=400):
         """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
+    def generate_speech_chunk(self, text_chunk, speaker_embedding):
         """Generate speech for a single chunk"""
         try:
+            inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                speech = self.model.generate_speech(
+                    inputs["input_ids"],
+                    torch.tensor(speaker_embedding).unsqueeze(0).to(self.device),
+                    vocoder=self.vocoder
+                )
+            if isinstance(speech, torch.Tensor):
+                speech = speech.cpu().numpy()
+            return speech
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
+            print(f"Chunk text: {text_chunk}")
             return None
+    def generate_long_speech(self, text, speaker_id=None, progress_callback=None):
         """Generate speech for long text"""
         processed_text = self.preprocess_text(text)
+        print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
         chunks = self.chunk_text(processed_text)
         print(f"Split into {len(chunks)} chunks")
+        if not chunks:
+            return None, None
+        # Generate speech for each chunk
         audio_segments = []
+        sample_rate = 16000
         for i, chunk in enumerate(chunks):
             if progress_callback:
                 progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
             print(f"Processing chunk {i+1}: {chunk}")
+            audio_chunk = self.generate_speech_chunk(chunk, self.speakers[speaker_id or self.speaker_ids[0]])
+            if audio_chunk is not None and len(audio_chunk) > 0:
+                if len(audio_chunk.shape) > 1:
+                    audio_chunk = np.mean(audio_chunk, axis=0)
+                audio_segments.append(audio_chunk)
+                pause_samples = int(0.4 * sample_rate)
+                silence = np.zeros(pause_samples)
                 audio_segments.append(silence)
             time.sleep(0.1)
         if not audio_segments:
             return None, None
         final_audio = np.concatenate(audio_segments)
         max_val = np.max(np.abs(final_audio))
         if max_val > 0:
             final_audio = final_audio / max_val * 0.95
+        return final_audio, sample_rate
 # Global TTS system
     tts_system = None
+def text_to_speech_interface(text, speaker="Speaker 1 (7306)", progress=gr.Progress()):
+    """Main Gradio interface function"""
     if tts_system is None:
         return None, "❌ TTS system is not available. Please check the logs."
     if not text or not text.strip():
     try:
         progress(0.1, desc="🔄 Starting text-to-speech conversion...")
         audio, sample_rate = tts_system.generate_long_speech(text, speaker, progress_callback)
+        if audio is None or len(audio) == 0:
             return None, "❌ Failed to generate audio."
         progress(0.9, desc="💾 Saving audio file...")
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
         progress(1.0, desc="✅ Complete!")
         duration = len(audio) / sample_rate
         return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
     except Exception as e:
+        error_msg = f"❌ Error: {str(e)}"
+        print(f"TTS Error: {e}")
+        return None, error_msg
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
+        title="🎤 Long-Form Text-to-Speech",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
+            <p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
         </div>
         """)
+        # System status
+        if tts_system:
             gr.HTML("""
             <div style="padding: 1rem; border-radius: 10px; margin: 1rem 0; border-left: 4px solid #28a745; background: #f8f9fa;">
                 <h4>🟢 System Ready</h4>
+                <p>Using <strong>Microsoft SpeechT5</strong> - High quality neural text-to-speech</p>
             </div>
             """)
         else:
                 <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
             """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     placeholder="Type or paste your text here... (Max 50,000 characters)",
                     lines=10,
                     max_lines=20,
+                    info="Supports any length text with automatic chunking for optimal quality"
                 )
                 char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 50,000</span>")
                 speaker_dropdown = gr.Dropdown(
+                    choices=tts_system.speaker_ids if tts_system else [],
+                    value=tts_system.speaker_ids[0] if tts_system and tts_system.speaker_ids else None,
                     label="🗣️ Choose Voice"
                 )
                 generate_btn = gr.Button("🎯 Generate Speech", variant="primary", size="lg", scale=1)
                         <li>⚡ Smart text processing</li>
                         <li>🔧 Auto chunking</li>
                         <li>🎵 Natural-sounding speech</li>
+                        <li>🔊 MP3 audio output</li>
                     </ul>
                 </div>
                 """)
         gr.Examples(
             examples=[
+                ["Hello! Welcome to our advanced text-to-speech system.", "Speaker 1 (7306)"],
+                ["The quick brown fox jumps over the lazy dog.", "Speaker 2 (7339)"],
+                ["Artificial intelligence has revolutionized many aspects of our lives.", "Speaker 3 (7341)"],
             ],
             inputs=[text_input, speaker_dropdown],
             label="📚 Try These Examples"