Spaces:

Nick021402
/

Text2speech

Sleeping

App Files Community

Nick021402 committed on May 23, 2025

Commit

a7ffad5

verified ·

1 Parent(s): c4f9f48

Update app.py

Browse files

Files changed (1) hide show

app.py +210 -198

app.py CHANGED Viewed

@@ -2,16 +2,16 @@ import gradio as gr
 import torch
 import numpy as np
 import re
-from transformers import pipeline
 import soundfile as sf
-import io
 import tempfile
 import os
-from pydub import AudioSegment
 import nltk
 from nltk.tokenize import sent_tokenize
 import warnings
 import time
 warnings.filterwarnings("ignore")
 # Download required NLTK data
@@ -22,48 +22,31 @@ except LookupError:
 class LongFormTTS:
     def __init__(self):
-        print("Loading TTS models...")
-        # Try multiple TTS approaches for better compatibility
-        self.tts_pipeline = None
-        self.backup_tts = None
-        # Primary: Try Bark (works well on HF Spaces)
         try:
-            print("Loading Bark TTS...")
-            self.tts_pipeline = pipeline(
-                "text-to-speech",
-                model="suno/bark-small",
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            )
-            self.tts_method = "bark"
-            print("✅ Bark TTS loaded successfully!")
-        except Exception as e:
-            print(f"❌ Bark TTS failed: {e}")
-            # Backup: Try Parler TTS
-            try:
-                print("Loading Parler TTS...")
-                self.tts_pipeline = pipeline(
-                    "text-to-speech",
-                    model="parler-tts/parler_tts_mini_v0.1",
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                )
-                self.tts_method = "parler"
-                print("✅ Parler TTS loaded successfully!")
-            except Exception as e:
-                print(f"❌ Parler TTS failed: {e}")
-                # Final backup: Try FastSpeech2
-                try:
-                    print("Loading FastSpeech2...")
-                    from TTS.api import TTS
-                    self.backup_tts = TTS(model_name="tts_models/en/ljspeech/fastspeech2")
-                    self.tts_method = "fastspeech2"
-                    print("✅ FastSpeech2 loaded successfully!")
-                except Exception as e:
-                    print(f"❌ All TTS models failed: {e}")
-                    raise Exception("No TTS model could be loaded. Please check the requirements.")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
@@ -73,7 +56,7 @@ class LongFormTTS:
         # Handle common abbreviations
         abbreviations = {
             'Dr.': 'Doctor',
-            'Mr.': 'Mister',
             'Mrs.': 'Missus',
             'Ms.': 'Miss',
             'Prof.': 'Professor',
@@ -87,26 +70,35 @@ class LongFormTTS:
             'Inc.': 'Incorporated',
             'Corp.': 'Corporation',
             'Ltd.': 'Limited',
         }
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
-        # Handle numbers (basic)
-        text = re.sub(r'\b(\d+)\b', lambda m: self.number_to_words(int(m.group())), text)
-        # Clean up any problematic characters
-        text = re.sub(r'[^\w\s\.,!?;:\-\(\)]', '', text)
-        return text
     def number_to_words(self, num):
-        """Convert numbers to words (basic implementation)"""
         if num == 0:
             return "zero"
         if num > 9999:
-            return str(num)  # Keep large numbers as digits
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
@@ -122,123 +114,116 @@ class LongFormTTS:
         elif num < 1000:
             return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
-            return str(num)
-    def chunk_text(self, text, max_length=200):
-        """Split text into manageable chunks while preserving sentence boundaries"""
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
-            # If single sentence is too long, split by clauses
-            if len(sentence) > max_length:
-                clauses = re.split(r'[,;:]', sentence)
-                for clause in clauses:
-                    clause = clause.strip()
-                    if len(current_chunk + clause) > max_length:
-                        if current_chunk:
-                            chunks.append(current_chunk.strip())
-                            current_chunk = clause
-                        else:
-                            # Even single clause is too long, force split
-                            words = clause.split()
-                            temp_chunk = ""
-                            for word in words:
-                                if len(temp_chunk + word) > max_length:
-                                    if temp_chunk:
-                                        chunks.append(temp_chunk.strip())
-                                        temp_chunk = word
-                                    else:
-                                        chunks.append(word)
-                                else:
-                                    temp_chunk += " " + word if temp_chunk else word
                             if temp_chunk:
-                                current_chunk = temp_chunk
-                    else:
-                        current_chunk += " " + clause if current_chunk else clause
-            else:
-                if len(current_chunk + sentence) > max_length:
-                    if current_chunk:
-                        chunks.append(current_chunk.strip())
-                        current_chunk = sentence
-                    else:
-                        chunks.append(sentence)
                 else:
-                    current_chunk += " " + sentence if current_chunk else sentence
         if current_chunk:
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk):
-        """Generate speech for a single text chunk"""
         try:
-            if self.tts_method == "bark":
-                # Bark TTS
-                speech = self.tts_pipeline(text_chunk)
-                audio = speech["audio"]
-                sampling_rate = speech["sampling_rate"]
-                return audio, sampling_rate
-            elif self.tts_method == "parler":
-                # Parler TTS
-                speech = self.tts_pipeline(text_chunk)
-                audio = speech["audio"]
-                sampling_rate = speech["sampling_rate"]
-                return audio, sampling_rate
-            elif self.tts_method == "fastspeech2":
-                # FastSpeech2 via TTS library
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                    self.backup_tts.tts_to_file(text=text_chunk, file_path=tmp_file.name)
-                    audio, sr = sf.read(tmp_file.name)
-                    os.unlink(tmp_file.name)
-                    return audio, sr
-            else:
-                raise Exception("No TTS method available")
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
-            return None, None
     def generate_long_speech(self, text, progress_callback=None):
-        """Generate speech for long text by processing in chunks"""
         # Preprocess text
-        text = self.preprocess_text(text)
         # Split into chunks
-        chunks = self.chunk_text(text, max_length=150)  # Smaller chunks for better compatibility
-        print(f"Split text into {len(chunks)} chunks")
         if not chunks:
             return None, None
         # Generate speech for each chunk
         audio_segments = []
-        sampling_rate = None
-        total_chunks = len(chunks)
         for i, chunk in enumerate(chunks):
             if progress_callback:
-                progress_callback(f"Processing chunk {i+1}/{total_chunks}: {chunk[:30]}...")
-            audio_chunk, sr = self.generate_speech_chunk(chunk)
             if audio_chunk is not None and len(audio_chunk) > 0:
-                if sampling_rate is None:
-                    sampling_rate = sr
                 # Ensure audio is 1D
                 if len(audio_chunk.shape) > 1:
-                    audio_chunk = audio_chunk.mean(axis=1)
                 audio_segments.append(audio_chunk)
-                # Add small pause between chunks (300ms of silence)
-                pause_duration = int(0.3 * sampling_rate)
-                silence = np.zeros(pause_duration)
                 audio_segments.append(silence)
             # Small delay to prevent overwhelming the system
@@ -247,81 +232,94 @@ class LongFormTTS:
         if not audio_segments:
             return None, None
-        # Concatenate all audio segments
         final_audio = np.concatenate(audio_segments)
-        return final_audio, sampling_rate
-# Initialize TTS system
-print("Initializing TTS system...")
 try:
     tts_system = LongFormTTS()
-    print("✅ TTS system initialized successfully!")
 except Exception as e:
-    print(f"❌ Failed to initialize TTS system: {e}")
     tts_system = None
 def text_to_speech_interface(text, progress=gr.Progress()):
-    """Main interface function for Gradio"""
     if tts_system is None:
-        return None, "❌ TTS system not available. Please check the logs."
-    if not text.strip():
-        return None, "Please enter some text to convert to speech."
-    if len(text) > 10000:
-        return None, "Text is too long. Please keep it under 10,000 characters for optimal performance."
     def progress_callback(message):
         progress(0.5, desc=message)
     try:
-        progress(0.1, desc="Starting text-to-speech conversion...")
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
         if audio is None or len(audio) == 0:
-            return None, "Failed to generate audio. Please try with shorter text or check your input."
-        progress(0.9, desc="Finalizing audio...")
         # Save to temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
-        progress(1.0, desc="Complete!")
         duration = len(audio) / sample_rate
-        return audio_path, f"✅ Successfully generated {duration:.1f} seconds of audio using {tts_system.tts_method.upper()}!"
     except Exception as e:
         error_msg = f"❌ Error: {str(e)}"
-        print(error_msg)
         return None, error_msg
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
-        title="🎤 Long-Form Text-to-Speech Generator",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
             text-align: center;
             margin-bottom: 2rem;
         }
         .feature-box {
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             color: white;
-            padding: 1rem;
-            border-radius: 10px;
             margin: 1rem 0;
         }
         .status-box {
-            background: #f8f9fa;
-            border-left: 4px solid #007bff;
             padding: 1rem;
             margin: 1rem 0;
         }
         """
     ) as demo:
@@ -329,76 +327,79 @@ def create_interface():
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
-            <p>Convert any length of text to natural human-like speech using free AI models</p>
         </div>
         """)
-        # Show TTS system status
-        if tts_system is not None:
-            status_html = f"""
             <div class="status-box">
-                <h4>🟢 System Status: Ready</h4>
-                <p>Using <strong>{tts_system.tts_method.upper()}</strong> TTS engine</p>
             </div>
-            """
         else:
-            status_html = """
-            <div class="status-box" style="border-left-color: #dc3545;">
-                <h4>🔴 System Status: Error</h4>
-                <p>TTS system failed to initialize. Please check the logs.</p>
             </div>
-            """
-        gr.HTML(status_html)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
-                    label="📝 Enter your text (max 10,000 characters)",
-                    placeholder="Type or paste your text here...",
-                    lines=8,
-                    max_lines=15
                 )
-                char_count = gr.HTML("Character count: 0")
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
-                    size="lg"
                 )
             with gr.Column(scale=1):
                 gr.HTML("""
                 <div class="feature-box">
-                    <h3>✨ Features</h3>
-                    <ul>
-                        <li>🚀 Long text support</li>
-                        <li>🤖 Multiple TTS engines</li>
-                        <li>⚡ Smart text chunking</li>
                         <li>🆓 Completely free</li>
-                        <li>🔧 Auto preprocessing</li>
                         <li>📱 Mobile friendly</li>
                     </ul>
                 </div>
                 """)
-        status_text = gr.Textbox(
             label="📊 Status",
             interactive=False,
-            value="Ready to generate speech!"
         )
         audio_output = gr.Audio(
             label="🔊 Generated Speech",
-            type="filepath"
         )
         # Character counter
         def update_char_count(text):
-            count = len(text)
-            color = "green" if count <= 10000 else "red"
-            return f'<span style="color: {color};">Character count: {count}/10,000</span>'
         text_input.change(
             fn=update_char_count,
@@ -406,40 +407,51 @@ def create_interface():
             outputs=[char_count]
         )
-        # Event handlers
         generate_btn.click(
             fn=text_to_speech_interface,
             inputs=[text_input],
-            outputs=[audio_output, status_text]
         )
         # Example texts
         gr.Examples(
             examples=[
-                ["Hello! This is a test of the text-to-speech system. It can handle longer texts by splitting them into smaller chunks."],
-                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
-                ["In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, but a comfortable hobbit-hole."],
-                ["Welcome to our advanced text-to-speech generator. This system uses state-of-the-art AI models to convert your text into natural-sounding speech. You can input texts of various lengths, and the system will intelligently process them to create high-quality audio output."]
             ],
-            inputs=[text_input]
         )
         gr.HTML("""
-        <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 5px;">
-            <h4>🔧 How it works:</h4>
-            <ol>
-                <li><strong>Multiple Engines:</strong> Tries Bark, Parler, or FastSpeech2 TTS models</li>
-                <li><strong>Smart Chunking:</strong> Splits long text at natural boundaries</li>
-                <li><strong>Audio Processing:</strong> Combines chunks with natural pauses</li>
-                <li><strong>Quality Output:</strong> Generates high-quality WAV audio</li>
             </ol>
-            <p><em>💡 Tip: For best results, use well-formatted text with proper punctuation!</em></p>
         </div>
         """)
     return demo
-# Launch the app
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(

 import torch
 import numpy as np
 import re
 import soundfile as sf
 import tempfile
 import os
 import nltk
 from nltk.tokenize import sent_tokenize
 import warnings
 import time
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
 warnings.filterwarnings("ignore")
 # Download required NLTK data
 class LongFormTTS:
     def __init__(self):
+        print("🔄 Loading TTS models...")
         try:
+            # Load SpeechT5 - most reliable for HF Spaces
+            print("Loading SpeechT5 TTS...")
+            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+            # Load speaker embeddings
+            print("Loading speaker embeddings...")
+            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            # Use a different speaker embedding for more variety
+            self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.model = self.model.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
+            self.speaker_embeddings = self.speaker_embeddings.to(self.device)
+            print("✅ SpeechT5 loaded successfully!")
+        except Exception as e:
+            print(f"❌ Failed to load SpeechT5: {e}")
+            raise Exception(f"TTS model loading failed: {e}")
     def preprocess_text(self, text):
         """Clean and prepare text for TTS"""
         # Handle common abbreviations
         abbreviations = {
             'Dr.': 'Doctor',
+            'Mr.': 'Mister',
             'Mrs.': 'Missus',
             'Ms.': 'Miss',
             'Prof.': 'Professor',
             'Inc.': 'Incorporated',
             'Corp.': 'Corporation',
             'Ltd.': 'Limited',
+            'U.S.': 'United States',
+            'U.K.': 'United Kingdom',
+            'Ph.D.': 'PhD',
+            'M.D.': 'MD',
         }
         for abbr, full in abbreviations.items():
             text = text.replace(abbr, full)
+        # Convert numbers to words (enhanced)
+        text = re.sub(r'\b(\d{1,4})\b', lambda m: self.number_to_words(int(m.group())), text)
+        # Handle years differently (keep as numbers if between 1000-2100)
+        text = re.sub(r'\b(1[0-9]{3}|20[0-9]{2}|2100)\b', lambda m: m.group(), text)
+        # Clean up problematic characters but keep essential punctuation
+        text = re.sub(r'[^\w\s\.,!?;:\-\(\)\'"]', ' ', text)
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
     def number_to_words(self, num):
+        """Convert numbers to words"""
         if num == 0:
             return "zero"
+        # Keep larger numbers as digits to avoid very long text
         if num > 9999:
+            return str(num)
         ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
         teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
         elif num < 1000:
             return ones[num // 100] + " hundred" + ("" if num % 100 == 0 else " " + self.number_to_words(num % 100))
         else:
+            thousands = num // 1000
+            remainder = num % 1000
+            result = self.number_to_words(thousands) + " thousand"
+            if remainder > 0:
+                result += " " + self.number_to_words(remainder)
+            return result
+    def chunk_text(self, text, max_length=400):
+        """Split text into manageable chunks"""
         sentences = sent_tokenize(text)
         chunks = []
         current_chunk = ""
         for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            # If adding this sentence would exceed limit
+            if len(current_chunk + " " + sentence) > max_length:
+                # Save current chunk if it exists
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                # If single sentence is too long, split it
+                if len(sentence) > max_length:
+                    words = sentence.split()
+                    temp_chunk = ""
+                    for word in words:
+                        if len(temp_chunk + " " + word) > max_length:
                             if temp_chunk:
+                                chunks.append(temp_chunk.strip())
+                                temp_chunk = word
+                            else:
+                                # Single word too long, just add it
+                                chunks.append(word)
+                        else:
+                            temp_chunk = temp_chunk + " " + word if temp_chunk else word
+                    current_chunk = temp_chunk
                 else:
+                    current_chunk = sentence
+            else:
+                current_chunk = current_chunk + " " + sentence if current_chunk else sentence
+        # Add the last chunk
         if current_chunk:
             chunks.append(current_chunk.strip())
         return [chunk for chunk in chunks if chunk.strip()]
     def generate_speech_chunk(self, text_chunk):
+        """Generate speech for a single chunk"""
         try:
+            # Process text through the model
+            inputs = self.processor(text=text_chunk, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                speech = self.model.generate_speech(
+                    inputs["input_ids"],
+                    self.speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+            # Convert to numpy and move to CPU
+            if isinstance(speech, torch.Tensor):
+                speech = speech.cpu().numpy()
+            return speech
         except Exception as e:
             print(f"Error generating speech for chunk: {e}")
+            print(f"Chunk text: {text_chunk}")
+            return None
     def generate_long_speech(self, text, progress_callback=None):
+        """Generate speech for long text"""
         # Preprocess text
+        processed_text = self.preprocess_text(text)
+        print(f"Original length: {len(text)}, Processed length: {len(processed_text)}")
         # Split into chunks
+        chunks = self.chunk_text(processed_text)
+        print(f"Split into {len(chunks)} chunks")
         if not chunks:
             return None, None
         # Generate speech for each chunk
         audio_segments = []
+        sample_rate = 16000  # SpeechT5 uses 16kHz
         for i, chunk in enumerate(chunks):
             if progress_callback:
+                progress_callback(f"Processing chunk {i+1}/{len(chunks)}: {chunk[:40]}{'...' if len(chunk) > 40 else ''}")
+            print(f"Processing chunk {i+1}: {chunk}")
+            audio_chunk = self.generate_speech_chunk(chunk)
             if audio_chunk is not None and len(audio_chunk) > 0:
                 # Ensure audio is 1D
                 if len(audio_chunk.shape) > 1:
+                    audio_chunk = np.mean(audio_chunk, axis=0)
                 audio_segments.append(audio_chunk)
+                # Add pause between chunks (400ms)
+                pause_samples = int(0.4 * sample_rate)
+                silence = np.zeros(pause_samples)
                 audio_segments.append(silence)
             # Small delay to prevent overwhelming the system
         if not audio_segments:
             return None, None
+        # Concatenate all segments
         final_audio = np.concatenate(audio_segments)
+        # Normalize audio to prevent clipping
+        max_val = np.max(np.abs(final_audio))
+        if max_val > 0:
+            final_audio = final_audio / max_val * 0.95
+        return final_audio, sample_rate
+# Global TTS system
+print("🚀 Initializing TTS system...")
 try:
     tts_system = LongFormTTS()
+    print("✅ TTS system ready!")
 except Exception as e:
+    print(f"❌ TTS initialization failed: {e}")
     tts_system = None
 def text_to_speech_interface(text, progress=gr.Progress()):
+    """Main Gradio interface function"""
     if tts_system is None:
+        return None, "❌ TTS system is not available. Please check the logs."
+    if not text or not text.strip():
+        return None, "⚠️ Please enter some text to convert to speech."
+    # Text length check
+    if len(text) > 5000:
+        return None, "⚠️ Text is too long. Please keep it under 5,000 characters for optimal performance."
     def progress_callback(message):
         progress(0.5, desc=message)
     try:
+        progress(0.1, desc="🔄 Starting text-to-speech conversion...")
+        # Generate audio
         audio, sample_rate = tts_system.generate_long_speech(text, progress_callback)
         if audio is None or len(audio) == 0:
+            return None, "❌ Failed to generate audio. Please try with different text."
+        progress(0.9, desc="💾 Saving audio file...")
         # Save to temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, audio, sample_rate)
             audio_path = tmp_file.name
+        progress(1.0, desc="✅ Complete!")
         duration = len(audio) / sample_rate
+        return audio_path, f"✅ Generated {duration:.1f} seconds of audio successfully!"
     except Exception as e:
         error_msg = f"❌ Error: {str(e)}"
+        print(f"TTS Error: {e}")
         return None, error_msg
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
+        title="🎤 Long-Form Text-to-Speech",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
             text-align: center;
             margin-bottom: 2rem;
+            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
+            background-clip: text;
         }
         .feature-box {
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             color: white;
+            padding: 1.5rem;
+            border-radius: 15px;
             margin: 1rem 0;
+            box-shadow: 0 4px 15px rgba(0,0,0,0.1);
         }
         .status-box {
             padding: 1rem;
+            border-radius: 10px;
             margin: 1rem 0;
+            border-left: 4px solid #28a745;
+            background: #f8f9fa;
         }
         """
     ) as demo:
         gr.HTML("""
         <div class="main-header">
             <h1>🎤 Long-Form Text-to-Speech Generator</h1>
+            <p style="color: #666; font-size: 1.1em;">Transform any text into natural human-like speech using advanced AI</p>
         </div>
         """)
+        # System status
+        if tts_system:
+            gr.HTML("""
             <div class="status-box">
+                <h4>🟢 System Ready</h4>
+                <p>Using <strong>Microsoft SpeechT5</strong> - High quality neural text-to-speech</p>
             </div>
+            """)
         else:
+            gr.HTML("""
+            <div class="status-box" style="border-left-color: #dc3545; background: #f8d7da;">
+                <h4>🔴 System Error</h4>
+                <p>TTS system failed to initialize. Please refresh the page.</p>
             </div>
+            """)
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
+                    label="📝 Enter Your Text",
+                    placeholder="Type or paste your text here... (Max 5,000 characters)",
+                    lines=10,
+                    max_lines=20,
+                    info="Supports any length text with automatic chunking for optimal quality"
                 )
+                char_count = gr.HTML("<span style='color: #666;'>Character count: 0 / 5,000</span>")
                 generate_btn = gr.Button(
                     "🎯 Generate Speech",
                     variant="primary",
+                    size="lg",
+                    scale=1
                 )
             with gr.Column(scale=1):
                 gr.HTML("""
                 <div class="feature-box">
+                    <h3>✨ Key Features</h3>
+                    <ul style="margin: 0; padding-left: 1.2em;">
+                        <li>🚀 Handles long texts</li>
+                        <li>🎭 Natural human voice</li>
+                        <li>⚡ Smart text processing</li>
+                        <li>🔧 Auto chunking</li>
                         <li>🆓 Completely free</li>
                         <li>📱 Mobile friendly</li>
+                        <li>🎵 High quality audio</li>
                     </ul>
                 </div>
                 """)
+        # Status and output
+        status_output = gr.Textbox(
             label="📊 Status",
             interactive=False,
+            value="Ready to generate speech! Enter some text above."
         )
         audio_output = gr.Audio(
             label="🔊 Generated Speech",
+            type="filepath",
+            show_download_button=True
         )
         # Character counter
         def update_char_count(text):
+            count = len(text) if text else 0
+            color = "#28a745" if count <= 5000 else "#dc3545"
+            return f'<span style="color: {color};">Character count: {count:,} / 5,000</span>'
         text_input.change(
             fn=update_char_count,
             outputs=[char_count]
         )
+        # Generate button click
         generate_btn.click(
             fn=text_to_speech_interface,
             inputs=[text_input],
+            outputs=[audio_output, status_output],
+            show_progress=True
         )
         # Example texts
         gr.Examples(
             examples=[
+                ["Hello! Welcome to our advanced text-to-speech system. This technology can convert any written text into natural-sounding human speech."],
+                ["The quick brown fox jumps over the lazy dog. This pangram contains every letter of the English alphabet and is perfect for testing speech synthesis."],
+                ["In the beginning was the Word, and the Word was with God, and the Word was God. This famous opening from the Gospel of John demonstrates the power of language."],
+                ["Artificial intelligence has revolutionized many aspects of our daily lives. From voice assistants to recommendation systems, AI technologies are becoming increasingly sophisticated and accessible to everyone."],
+                ["Once upon a time, in a land far away, there lived a wise old wizard who possessed the power to transform written words into spoken language. This magical ability brought stories to life for all who listened."]
             ],
+            inputs=[text_input],
+            label="📚 Try These Examples"
         )
+        # Information section
         gr.HTML("""
+        <div style="margin-top: 2rem; padding: 1.5rem; background: #f8f9fa; border-radius: 10px; border-left: 4px solid #007bff;">
+            <h4>🔧 How It Works</h4>
+            <ol style="margin: 0.5rem 0; padding-left: 1.5rem;">
+                <li><strong>Text Processing:</strong> Automatically cleans and normalizes your input text</li>
+                <li><strong>Smart Chunking:</strong> Splits long text at natural sentence boundaries</li>
+                <li><strong>Neural Synthesis:</strong> Uses Microsoft's SpeechT5 model for speech generation</li>
+                <li><strong>Audio Assembly:</strong> Combines all chunks with natural pauses</li>
             </ol>
+            <h4 style="margin-top: 1rem;">💡 Tips for Best Results</h4>
+            <ul style="margin: 0.5rem 0; padding-left: 1.5rem;">
+                <li>Use proper punctuation for natural pauses and intonation</li>
+                <li>Spell out abbreviations if you want them pronounced fully</li>
+                <li>Well-formatted text produces the most natural speech</li>
+                <li>The system automatically handles common abbreviations and numbers</li>
+            </ul>
         </div>
         """)
     return demo
+# Launch the application
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(