Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Files Community

Nick021402 committed on May 23, 2025

Commit

c4f9f48

verified ·

1 Parent(s): adc6379

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -55

app.py CHANGED Viewed

@@ -2,17 +2,16 @@ import gradio as gr
 import torch
 import numpy as np
 import re
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from datasets import load_dataset
 import soundfile as sf
 import io
 import tempfile
 import os
 from pydub import AudioSegment
-from pydub.silence import split_on_silence
 import nltk
 from nltk.tokenize import sent_tokenize
 import warnings
 warnings.filterwarnings("ignore")
 # Download required NLTK data
@@ -25,16 +24,46 @@ class LongFormTTS:
     def __init__(self):
         print("Loading TTS models...")
-        # Load SpeechT5 models (free and high quality)
-        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-        # Load speaker embeddings dataset
-        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-        self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-        print("Models loaded successfully!")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
@@ -52,6 +81,12 @@ class LongFormTTS:
             'vs.': 'versus',
             'e.g.': 'for example',
             'i.e.': 'that is',
         }
         for abbr, full in abbreviations.items():
@@ -60,6 +95,9 @@ class LongFormTTS:
         # Handle numbers (basic)
         text = re.sub(r'\b(\d+)\b', lambda m: self.number_to_words(int(m.group())), text)
         return text
     def number_to_words(self, num):
@@ -67,6 +105,9 @@ class LongFormTTS:
         if num == 0:
             return "zero"
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
@@ -81,9 +122,9 @@ class LongFormTTS:
         elif num < 1000:
             return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
-            return str(num)  # Fallback for larger numbers
-    def chunk_text(self, text, max_length=500):
         """Split text into manageable chunks while preserving sentence boundaries"""
         sentences = sent_tokenize(text)
         chunks = []
@@ -109,7 +150,7 @@ class LongFormTTS:
                                         chunks.append(temp_chunk.strip())
                                         temp_chunk = word
                                     else:
-                                        chunks.append(word)  # Single word longer than limit
                                 else:
                                     temp_chunk += " " + word if temp_chunk else word
                             if temp_chunk:
@@ -134,16 +175,33 @@ class LongFormTTS:
     def generate_speech_chunk(self, text_chunk):
         """Generate speech for a single text chunk"""
         try:
-            inputs = self.processor(text=text_chunk, return_tensors="pt")
-            speech = self.model.generate_speech(
-                inputs["input_ids"],
-                self.speaker_embeddings,
-                vocoder=self.vocoder
-            )
-            return speech.numpy()
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
-            return np.array([])
     def generate_long_speech(self, text, progress_callback=None):
         """Generate speech for long text by processing in chunks"""
@@ -151,45 +209,69 @@ class LongFormTTS:
         text = self.preprocess_text(text)
         # Split into chunks
-        chunks = self.chunk_text(text)
         print(f"Split text into {len(chunks)} chunks")
         if not chunks:
-            return np.array([]), 16000
         # Generate speech for each chunk
         audio_segments = []
         total_chunks = len(chunks)
         for i, chunk in enumerate(chunks):
             if progress_callback:
-                progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:50]}...")
-            speech_chunk = self.generate_speech_chunk(chunk)
-            if len(speech_chunk) > 0:
-                audio_segments.append(speech_chunk)
-            # Add small pause between chunks (200ms of silence)
-            pause_duration = int(0.2 * 16000)  # 200ms at 16kHz
-            silence = np.zeros(pause_duration)
-            audio_segments.append(silence)
         if not audio_segments:
-            return np.array([]), 16000
         # Concatenate all audio segments
         final_audio = np.concatenate(audio_segments)
-        return final_audio, 16000
 # Initialize TTS system
-tts_system = LongFormTTS()
 def text_to_speech_interface(text, progress=gr.Progress()):
     """Main interface function for Gradio"""
     if not text.strip():
         return None, "Please enter some text to convert to speech."
     def progress_callback(message):
         progress(0.5, desc=message)
@@ -198,8 +280,8 @@ def text_to_speech_interface(text, progress=gr.Progress()):
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
-        if len(audio) == 0:
-            return None, "Failed to generate audio. Please try again."
         progress(0.9, desc="Finalizing audio...")
@@ -210,7 +292,8 @@ def text_to_speech_interface(text, progress=gr.Progress()):
         progress(1.0, desc="Complete!")
-        return audio_path, f"✅ Successfully generated {len(audio)/sample_rate:.1f} seconds of audio!"
     except Exception as e:
         error_msg = f"❌ Error: {str(e)}"
@@ -234,6 +317,12 @@ def create_interface():
             border-radius: 10px;
             margin: 1rem 0;
         }
         """
     ) as demo:
@@ -244,15 +333,35 @@ def create_interface():
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
-                    label="📝 Enter your text",
-                    placeholder="Type or paste any text here... No length limit!",
-                    lines=10,
-                    max_lines=20
                 )
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
@@ -264,11 +373,12 @@ def create_interface():
                 <div class="feature-box">
                     <h3>✨ Features</h3>
                     <ul>
-                        <li>🚀 Unlimited text length</li>
-                        <li>🤖 Human-like voice quality</li>
                         <li>⚡ Smart text chunking</li>
-                        <li>🆓 Completely free to use</li>
-                        <li>🔧 Automatic text preprocessing</li>
                     </ul>
                 </div>
                 """)
@@ -284,6 +394,18 @@ def create_interface():
             type="filepath"
         )
         # Event handlers
         generate_btn.click(
             fn=text_to_speech_interface,
@@ -294,9 +416,10 @@ def create_interface():
         # Example texts
         gr.Examples(
             examples=[
-                ["Hello! This is a test of the long-form text-to-speech system. It can handle texts of any length by intelligently splitting them into smaller chunks while maintaining natural flow and pronunciation."],
-                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet and is commonly used for testing text-to-speech systems."],
-                ["In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole filled with the ends of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole, and that means comfort."]
             ],
             inputs=[text_input]
         )
@@ -305,12 +428,12 @@ def create_interface():
         <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 5px;">
             <h4>🔧 How it works:</h4>
             <ol>
-                <li><strong>Text Preprocessing:</strong> Cleans and normalizes your input text</li>
-                <li><strong>Smart Chunking:</strong> Splits long text at sentence boundaries</li>
-                <li><strong>Speech Generation:</strong> Uses Microsoft's SpeechT5 model for each chunk</li>
-                <li><strong>Audio Merging:</strong> Combines all chunks with natural pauses</li>
             </ol>
-            <p><em>💡 Tip: The system works best with well-formatted text with proper punctuation!</em></p>
         </div>
         """)

 import torch
 import numpy as np
 import re
+from transformers import pipeline
 import soundfile as sf
 import io
 import tempfile
 import os
 from pydub import AudioSegment
 import nltk
 from nltk.tokenize import sent_tokenize
 import warnings
+import time
 warnings.filterwarnings("ignore")
 # Download required NLTK data
     def __init__(self):
         """Load the first TTS backend that initialises successfully.

         Backends are attempted in this order:

         1. Bark, through the ``transformers`` text-to-speech pipeline.
         2. Parler, through the same pipeline API.
         3. FastSpeech2, through the ``TTS`` package (imported lazily so the
            package is only required when the first two attempts fail).

         On success ``self.tts_method`` holds "bark", "parler" or
         "fastspeech2"; the loaded object is stored in ``self.tts_pipeline``
         (pipeline backends) or ``self.backup_tts`` (FastSpeech2).

         Raises:
             Exception: if no backend can be loaded. The FastSpeech2 loader
                 error is attached as the cause.
         """
         print("Loading TTS models...")

         # Try multiple TTS approaches for better compatibility
         self.tts_pipeline = None
         self.backup_tts = None
         # Defined up front so the attribute always exists, even mid-failure.
         self.tts_method = None

         # (method name, label used in log messages, model id), in priority order.
         pipeline_candidates = (
             ("bark", "Bark TTS", "suno/bark-small"),
             ("parler", "Parler TTS", "parler-tts/parler_tts_mini_v0.1"),
         )
         for method, label, model_id in pipeline_candidates:
             try:
                 print(f"Loading {label}...")
                 self.tts_pipeline = pipeline(
                     "text-to-speech",
                     model=model_id,
                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 )
             except Exception as e:
                 # Best effort: report the failure and fall through to the next backend.
                 print(f"❌ {label} failed: {e}")
                 continue
             self.tts_method = method
             print(f"✅ {label} loaded successfully!")
             return

         # Final backup: Try FastSpeech2
         try:
             print("Loading FastSpeech2...")
             from TTS.api import TTS
             self.backup_tts = TTS(model_name="tts_models/en/ljspeech/fastspeech2")
         except Exception as e:
             print(f"❌ All TTS models failed: {e}")
             raise Exception("No TTS model could be loaded. Please check the requirements.") from e
         self.tts_method = "fastspeech2"
         print("✅ FastSpeech2 loaded successfully!")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
             'vs.': 'versus',
             'e.g.': 'for example',
             'i.e.': 'that is',
+            'St.': 'Street',
+            'Ave.': 'Avenue',
+            'Blvd.': 'Boulevard',
+            'Inc.': 'Incorporated',
+            'Corp.': 'Corporation',
+            'Ltd.': 'Limited',
         }
         for abbr, full in abbreviations.items():
         # Handle numbers (basic)
         text = re.sub(r'\b(\d+)\b', lambda m: self.number_to_words(int(m.group())), text)
+        # Clean up any problematic characters
+        text = re.sub(r'[^\w\s\.,!?;:\-\(\)]', '', text)
         return text
     def number_to_words(self, num):
         if num == 0:
             return "zero"
+        if num > 9999:
+            return str(num)  # Keep large numbers as digits
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
         elif num < 1000:
             return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
+            return str(num)
+    def chunk_text(self, text, max_length=200):
         """Split text into manageable chunks while preserving sentence boundaries"""
         sentences = sent_tokenize(text)
         chunks = []
                                         chunks.append(temp_chunk.strip())
                                         temp_chunk = word
                                     else:
+                                        chunks.append(word)
                                 else:
                                     temp_chunk += " " + word if temp_chunk else word
                             if temp_chunk:
     def generate_speech_chunk(self, text_chunk):
         """Generate speech for a single text chunk.

         Args:
             text_chunk: Text to synthesise with the backend named by
                 ``self.tts_method``.

         Returns:
             A tuple ``(audio, sampling_rate)``. For the pipeline backends
             ("bark", "parler") ``audio`` is always a 1-D NumPy array; for
             "fastspeech2" it is whatever ``sf.read`` returns for the
             rendered WAV file. On any failure the error is printed and
             ``(None, None)`` is returned.
         """
         try:
             if self.tts_method in ("bark", "parler"):
                 # Bark and Parler are both driven through the same pipeline call.
                 speech = self.tts_pipeline(text_chunk)
                 audio = np.asarray(speech["audio"])
                 if audio.ndim > 1:
                     # NOTE(review): the transformers text-to-audio pipeline
                     # documents its output as (nb_channels, audio_length).
                     # Down-mix over the channel axis here so callers never
                     # average over the time axis by mistake.
                     audio = audio.reshape(-1, audio.shape[-1]).mean(axis=0)
                 return audio, speech["sampling_rate"]

             if self.tts_method == "fastspeech2":
                 # FastSpeech2 via TTS library.
                 # Reserve a unique path, then close the handle before the
                 # library writes to it: an open handle blocks writing and
                 # unlinking on some platforms.
                 with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                     wav_path = tmp_file.name
                 try:
                     self.backup_tts.tts_to_file(text=text_chunk, file_path=wav_path)
                     audio, sr = sf.read(wav_path)
                 finally:
                     # Remove the temp file even when synthesis or reading fails.
                     os.unlink(wav_path)
                 return audio, sr

             raise Exception("No TTS method available")
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
             return None, None
     def generate_long_speech(self, text, progress_callback=None):
         """Generate speech for long text by processing in chunks"""
         text = self.preprocess_text(text)
         # Split into chunks
+        chunks = self.chunk_text(text, max_length=150)  # Smaller chunks for better compatibility
         print(f"Split text into {len(chunks)} chunks")
         if not chunks:
+            return None, None
         # Generate speech for each chunk
         audio_segments = []
+        sampling_rate = None
         total_chunks = len(chunks)
         for i, chunk in enumerate(chunks):
             if progress_callback:
+                progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:30]}...")
+            audio_chunk, sr = self.generate_speech_chunk(chunk)
+            if audio_chunk is not None and len(audio_chunk) > 0:
+                if sampling_rate is None:
+                    sampling_rate = sr
+                # Ensure audio is 1D
+                if len(audio_chunk.shape) > 1:
+                    audio_chunk = audio_chunk.mean(axis=1)
+                audio_segments.append(audio_chunk)
+                # Add small pause between chunks (300ms of silence)
+                pause_duration = int(0.3 * sampling_rate)
+                silence = np.zeros(pause_duration)
+                audio_segments.append(silence)
+            # Small delay to prevent overwhelming the system
+            time.sleep(0.1)
         if not audio_segments:
+            return None, None
         # Concatenate all audio segments
         final_audio = np.concatenate(audio_segments)
+        return final_audio, sampling_rate
 # Initialize TTS system
+print("Initializing TTS system...")
+try:
+    tts_system = LongFormTTS()
+    print("✅ TTS system initialized successfully!")
+except Exception as e:
+    print(f"❌ Failed to initialize TTS system: {e}")
+    tts_system = None
 def text_to_speech_interface(text, progress=gr.Progress()):
     """Main interface function for Gradio"""
+    if tts_system is None:
+        return None, "❌ TTS system not available. Please check the logs."
     if not text.strip():
         return None, "Please enter some text to convert to speech."
+    if len(text) > 10000:
+        return None, "Text is too long. Please keep it under 10,000 characters for optimal performance."
     def progress_callback(message):
         progress(0.5, desc=message)
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
+        if audio is None or len(audio) == 0:
+            return None, "Failed to generate audio. Please try with shorter text or check your input."
         progress(0.9, desc="Finalizing audio...")
         progress(1.0, desc="Complete!")
+        duration = len(audio) / sample_rate
+        return audio_path, f"✅ Successfully generated {duration:.1f} seconds of audio using {tts_system.tts_method.upper()}!"
     except Exception as e:
         error_msg = f"❌ Error: {str(e)}"
             border-radius: 10px;
             margin: 1rem 0;
         }
+        .status-box {
+            background: #f8f9fa;
+            border-left: 4px solid #007bff;
+            padding: 1rem;
+            margin: 1rem 0;
+        }
         """
     ) as demo:
         </div>
         """)
+        # Show TTS system status
+        if tts_system is not None:
+            status_html = f"""
+            <div class="status-box">
+                <h4>🟢 System Status: Ready</h4>
+                <p>Using <strong>{tts_system.tts_method.upper()}</strong> TTS engine</p>
+            </div>
+            """
+        else:
+            status_html = """
+            <div class="status-box" style="border-left-color: #dc3545;">
+                <h4>🔴 System Status: Error</h4>
+                <p>TTS system failed to initialize. Please check the logs.</p>
+            </div>
+            """
+        gr.HTML(status_html)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
+                    label="📝 Enter your text (max 10,000 characters)",
+                    placeholder="Type or paste your text here...",
+                    lines=8,
+                    max_lines=15
                 )
+                char_count = gr.HTML("Character count: 0")
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
                 <div class="feature-box">
                     <h3>✨ Features</h3>
                     <ul>
+                        <li>🚀 Long text support</li>
+                        <li>🤖 Multiple TTS engines</li>
                         <li>⚡ Smart text chunking</li>
+                        <li>🆓 Completely free</li>
+                        <li>🔧 Auto preprocessing</li>
+                        <li>📱 Mobile friendly</li>
                     </ul>
                 </div>
                 """)
             type="filepath"
         )
+        # Character counter
+        def update_char_count(text):
+            count = len(text)
+            color = "green" if count <= 10000 else "red"
+            return f'<span style="color: {color};">Character count: {count}/10,000</span>'
+        text_input.change(
+            fn=update_char_count,
+            inputs=[text_input],
+            outputs=[char_count]
+        )
         # Event handlers
         generate_btn.click(
             fn=text_to_speech_interface,
         # Example texts
         gr.Examples(
             examples=[
+                ["Hello! This is a test of the text-to-speech system. It can handle longer texts by splitting them into smaller chunks."],
+                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
+                ["In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, but a comfortable hobbit-hole."],
+                ["Welcome to our advanced text-to-speech generator. This system uses state-of-the-art AI models to convert your text into natural-sounding speech. You can input texts of various lengths, and the system will intelligently process them to create high-quality audio output."]
             ],
             inputs=[text_input]
         )
         <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 5px;">
             <h4>🔧 How it works:</h4>
             <ol>
+                <li><strong>Multiple Engines:</strong> Tries Bark, Parler, or FastSpeech2 TTS models</li>
+                <li><strong>Smart Chunking:</strong> Splits long text at natural boundaries</li>
+                <li><strong>Audio Processing:</strong> Combines chunks with natural pauses</li>
+                <li><strong>Quality Output:</strong> Generates high-quality WAV audio</li>
             </ol>
+            <p><em>💡 Tip: For best results, use well-formatted text with proper punctuation!</em></p>
         </div>
         """)