Edge-TTS-WebUI-Long-Text

Sleeping

App Files Community

cs2764 committed on Nov 30, 2025

Commit

f9ed71b

verified ·

1 Parent(s): d098a55

Upload 2 files

Browse files

Files changed (2) hide show

app.py +480 -476
text_cleaning.py +44 -0

app.py CHANGED Viewed

@@ -1,476 +1,480 @@
-import gradio as gr
-import edge_tts
-import asyncio
-import tempfile
-import os
-import re
-from pydub import AudioSegment
-import math
-import time
-from datetime import datetime, timedelta
-import logging
-from text_cleaning import TextCleaner
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
-async def get_voices():
-    voices = await edge_tts.list_voices()
-    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-def format_time_remaining(seconds):
-    """Format seconds into human readable time remaining"""
-    if seconds < 60:
-        return f"{int(seconds)}s"
-    elif seconds < 3600:
-        minutes = seconds / 60
-        return f"{minutes:.1f}m"
-    else:
-        hours = seconds / 3600
-        return f"{hours:.1f}h"
-def calculate_eta(start_time, completed_items, total_items):
-    """Calculate estimated time remaining"""
-    if completed_items == 0:
-        return "Calculating..."
-    elapsed_time = time.time() - start_time
-    time_per_item = elapsed_time / completed_items
-    remaining_items = total_items - completed_items
-    remaining_time = time_per_item * remaining_items
-    return format_time_remaining(remaining_time)
-def estimate_text_duration(text):
-    """Estimate speech duration in minutes based on text length"""
-    # Simple heuristic:
-    # For English (space-separated), ~150 words/min
-    # For Chinese (no spaces), ~300 chars/min
-    # We'll use a hybrid approach: count spaces to guess if it's space-separated.
-    if not text:
-        return 0
-    space_count = text.count(' ')
-    total_len = len(text)
-    # If spaces are < 10% of length, assume non-space-separated (like Chinese)
-    if space_count / total_len < 0.1:
-        # Approx 300 chars per minute for Chinese
-        duration = total_len / 300
-        # logger.debug(f"Estimated duration (char-based): {duration:.2f} min ({total_len} chars)")
-    else:
-        # Approx 150 words per minute for English
-        word_count = len(text.split())
-        duration = word_count / 150
-        # logger.debug(f"Estimated duration (word-based): {duration:.2f} min ({word_count} words)")
-    return duration
-def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
-    """Split text into segments that won't exceed limit with safety margin"""
-    max_duration = max_duration_minutes
-    estimated_duration = estimate_text_duration(text)
-    logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")
-    if estimated_duration <= max_duration and len(text) <= max_chars:
-        return [text]
-    logger.info(f"Text exceeds limits. Splitting...")
-    # Split by paragraphs first
-    paragraphs = text.split('\n\n')
-    segments = []
-    current_segment = ""
-    for paragraph in paragraphs:
-        paragraph_duration = estimate_text_duration(paragraph)
-        # If single paragraph is too long, split by sentences
-        # Improved regex to include Chinese punctuation
-        if paragraph_duration > max_duration or len(paragraph) > max_chars:
-            sentences = re.split(r'([.!?。！？]+)', paragraph)
-            # Re-attach delimiters to sentences
-            real_sentences = []
-            for i in range(0, len(sentences) - 1, 2):
-                real_sentences.append(sentences[i] + sentences[i+1])
-            if len(sentences) % 2 == 1 and sentences[-1]:
-                real_sentences.append(sentences[-1])
-            for sentence in real_sentences:
-                sentence = sentence.strip()
-                if not sentence:
-                    continue
-                # Check both duration and char count
-                if (estimate_text_duration(current_segment + sentence) > max_duration or
-                    len(current_segment + sentence) > max_chars) and current_segment:
-                    segments.append(current_segment.strip())
-                    current_segment = sentence
-                else:
-                    current_segment += sentence
-        else:
-            if (estimate_text_duration(current_segment + paragraph) > max_duration or
-                len(current_segment + paragraph) > max_chars) and current_segment:
-                segments.append(current_segment.strip())
-                current_segment = paragraph + "\n\n"
-            else:
-                current_segment += paragraph + "\n\n"
-    if current_segment.strip():
-        segments.append(current_segment.strip())
-    logger.info(f"Split text into {len(segments)} segments.")
-    return segments
-import io
-async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
-    """Generate audio for a single text segment and return as BytesIO"""
-    logger.info(f"Generating segment {segment_index}...")
-    communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
-    audio_data = io.BytesIO()
-    try:
-        async for chunk in communicate.stream():
-            if chunk["type"] == "audio":
-                audio_data.write(chunk["data"])
-    except Exception as e:
-        logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
-        raise gr.Error(f"Error generating segment {segment_index}: {e}")
-    audio_data.seek(0)
-    # Verify segment duration
-    try:
-        # Make a copy for verification so we don't consume the main buffer
-        verify_buffer = io.BytesIO(audio_data.getvalue())
-        seg_audio = AudioSegment.from_mp3(verify_buffer)
-        duration_min = len(seg_audio) / 1000 / 60
-        logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
-    except Exception as e:
-        logger.error(f"Error checking segment {segment_index} duration: {e}")
-    audio_data.seek(0)
-    return audio_data
-async def merge_audio_files(audio_objects):
-    """Merge multiple audio BytesIO objects into one file"""
-    if not audio_objects:
-        return None
-    logger.info(f"Merging {len(audio_objects)} audio segments...")
-    # Load and merge audio segments
-    combined = AudioSegment.empty()
-    for i, audio_obj in enumerate(audio_objects):
-        try:
-            audio_obj.seek(0)
-            segment = AudioSegment.from_mp3(audio_obj)
-            combined += segment
-            # Explicitly close/clear the BytesIO object to free memory
-            audio_obj.close()
-        except Exception as e:
-            logger.error(f"Error merging segment {i+1}: {e}")
-    # Save merged audio to a single temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        merged_path = tmp_file.name
-        combined.export(merged_path, format="mp3")
-    total_duration_min = len(combined) / 1000 / 60
-    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
-    return merged_path
-async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
-    """Generate speech with detailed progress tracking via generator"""
-    if not text.strip():
-        yield None, "Please enter text to convert.", None
-        return
-    if not voice:
-        yield None, "Please select a voice.", None
-        return
-    # Apply text cleaning if enabled
-    if cleaning_options and cleaning_options.get('enable_cleaning', False):
-        yield 0, "Cleaning text...", None
-        # original_text = text # Unused
-        text = TextCleaner.clean_text(text, cleaning_options)
-        if cleaning_options.get('save_cleaned', False):
-            # Create a filename based on timestamp or first few words
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            filename = f"text_{timestamp}.txt"
-            saved_path = TextCleaner.save_cleaned_text(text, filename)
-            if saved_path:
-                logger.info(f"Saved cleaned text to {saved_path}")
-        if not text.strip():
-            yield None, "Text cleaning resulted in empty text.", None
-            return
-    voice_short_name = voice.split(" - ")[0]
-    rate_str = f"{rate:+d}%"
-    volume_str = f"{volume:+d}%"
-    pitch_str = f"{pitch:+d}Hz"
-    # Check if text is too long and needs segmentation
-    estimated_duration = estimate_text_duration(text)
-    yield 0, "Starting text processing...", None
-    logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
-    if estimated_duration > 15:  # If longer than 15 minutes, split into segments
-        segments = split_text_by_paragraphs(text)
-        total_segments = len(segments)
-        segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
-        yield 5, segment_info, segment_info
-        if total_segments > 1:
-            # Generate audio for each segment with progress tracking
-            audio_objects = []
-            start_time = time.time()
-            for i, segment in enumerate(segments):
-                if segment.strip():
-                    segment_duration = estimate_text_duration(segment)
-                    progress = 10 + (80 * i / total_segments)  # 10% to 90%
-                    eta = calculate_eta(start_time, i, total_segments)
-                    status_msg = (
-                        f"Generating segment {i+1}/{total_segments}...\n"
-                        f"Segment duration: {segment_duration:.1f} min\n"
-                        f"ETA: {eta}"
-                    )
-                    logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
-                    yield progress, status_msg, segment_info
-                    # Generate to memory
-                    audio_obj = await generate_audio_segment(
-                        segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
-                    )
-                    audio_objects.append(audio_obj)
-            yield 90, "Merging audio files...", segment_info
-            # Merge all audio objects
-            merged_audio_path = await merge_audio_files(audio_objects)
-            yield 100, "Audio generation complete! ✅", segment_info
-            yield merged_audio_path, "Done", segment_info
-            return
-    # For short texts or single segment, use original method
-    yield 50, "Generating audio...", None
-    logger.info("Generating single segment audio...")
-    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    logger.info(f"Audio generated at {tmp_path}")
-    yield 100, "Audio generation complete! ✅", None
-    yield tmp_path, "Done", None
-async def tts_interface(text, voice, rate, volume, pitch,
-                        enable_cleaning, save_cleaned, clean_urls, clean_html,
-                        clean_ads, fix_enc, tidy_ws, del_gutenberg,
-                        del_special, wetext_norm):
-    """Enhanced TTS interface with detailed progress tracking"""
-    if not text.strip():
-        yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
-        return
-    if not voice:
-        yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
-        return
-    # Prepare cleaning options
-    cleaning_options = {
-        'enable_cleaning': enable_cleaning,
-        'save_cleaned': save_cleaned,
-        'remove_urls': clean_urls,
-        'remove_html': clean_html,
-        'filter_ads': clean_ads,
-        'fix_encoding': fix_enc,
-        'tidy_whitespace': tidy_ws,
-        'remove_gutenberg': del_gutenberg,
-        'remove_special_chars': del_special,
-        'wetext_normalization': wetext_norm
-    }
-    # We need to clean text here first to estimate duration correctly?
-    # Or let the generator handle it. The generator handles it, but estimation might be off.
-    # Ideally we clean first if enabled, then estimate.
-    working_text = text
-    if enable_cleaning:
-        working_text = TextCleaner.clean_text(text, cleaning_options)
-        if save_cleaned:
-             # We'll let the generator save it to avoid double saving or complex logic here,
-             # but we need to pass the options.
-             pass
-    estimated_duration = estimate_text_duration(working_text)
-    # Reset UI
-    yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
-    async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
-        if isinstance(result, tuple) and len(result) == 3:
-            # Progress update
-            progress_val, status_msg, segment_info = result
-            if isinstance(progress_val, (int, float)):
-                # It's a progress update
-                segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
-                yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
-            else:
-                # It's the final result (path, msg, info)
-                audio_path = progress_val
-                yield audio_path, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
-async def create_demo():
-    voices = await get_voices()
-    description = """
-    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
-    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
-    Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
-    Transform your words into stunning, professional-quality videos in just a few clicks.
-    ✨ Features:
-    • Convert text to engaging videos with customizable visuals
-    • Choose from 40+ languages and 300+ voices
-    • Perfect for creating audiobooks, storytelling, and language learning materials
-    • Ideal for educators, content creators, and language enthusiasts
-    📝 **Long Text Support**:
-    Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
-    """
-    default_voice = ""
-    for voice_key in voices.keys():
-        if "XiaoxiaoNeural" in voice_key:
-            default_voice = voice_key
-            break
-    with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
-        gr.Markdown("# Edge TTS Text-to-Speech")
-        gr.Markdown(description)
-        with gr.Row():
-            with gr.Column():
-                text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
-                # Add text analysis info
-                text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
-                with gr.Accordion("Text Cleaning Settings", open=True):
-                    with gr.Row():
-                        enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
-                        save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)
-                    with gr.Group(visible=True) as cleaning_options_group:
-                        with gr.Row():
-                            clean_urls = gr.Checkbox(label="Remove URLs", value=True)
-                            clean_html = gr.Checkbox(label="Remove HTML", value=True)
-                        with gr.Row():
-                            clean_ads = gr.Checkbox(label="Filter Ads", value=True)
-                            fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
-                        with gr.Row():
-                            tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)
-                            del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
-                        with gr.Row():
-                            del_special = gr.Checkbox(label="Remove Special Characters", value=True)
-                            wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)
-                    def toggle_options(enabled):
-                        return gr.update(visible=enabled)
-                    enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])
-                voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
-                with gr.Row():
-                    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
-                    volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
-                    pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
-                generate_btn = gr.Button("Generate Audio", variant="primary")
-            with gr.Column():
-                audio_output = gr.Audio(label="Generated Audio", type="filepath")
-                # Progress and status display
-                with gr.Group():
-                    gr.Markdown("### 📊 Processing Progress")
-                    progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)
-                # Processing details
-                with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
-                    status_output = gr.Markdown("Waiting...", visible=True)
-                # Segment information display
-                with gr.Accordion("📋 Segment Information", open=True) as segment_info:
-                    segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)
-        gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
-        # Add text analysis function
-        def analyze_text(text):
-            if not text.strip():
-                return "**Text Analysis**: Enter text to see estimated duration and segment count"
-            duration = estimate_text_duration(text)
-            word_count = len(text.split())
-            char_count = len(text)
-            if duration > 15:
-                segments = split_text_by_paragraphs(text)
-                segment_count = len(segments)
-                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
-            else:
-                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
-        # Update text analysis when text changes
-        text_input.change(
-            fn=analyze_text,
-            inputs=[text_input],
-            outputs=[text_info]
-        )
-        generate_btn.click(
-            fn=tts_interface,
-            inputs=[
-                text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
-                enable_cleaning, save_cleaned, clean_urls, clean_html,
-                clean_ads, fix_enc, tidy_ws, del_gutenberg,
-                del_special, wetext_norm
-            ],
-            outputs=[audio_output, progress_info, status_output, segment_details]
-        )
-    return demo
-async def main():
-    demo = await create_demo()
-    demo.queue(default_concurrency_limit=5)
-    demo.launch(show_api=False)
-if __name__ == "__main__":
-    asyncio.run(main())

+import gradio as gr
+import edge_tts
+import asyncio
+import tempfile
+import os
+import re
+from pydub import AudioSegment
+import math
+import time
+from datetime import datetime, timedelta
+import logging
+from text_cleaning import TextCleaner
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+def format_time_remaining(seconds):
+    """Format seconds into human readable time remaining"""
+    if seconds < 60:
+        return f"{int(seconds)}s"
+    elif seconds < 3600:
+        minutes = seconds / 60
+        return f"{minutes:.1f}m"
+    else:
+        hours = seconds / 3600
+        return f"{hours:.1f}h"
+def calculate_eta(start_time, completed_items, total_items):
+    """Calculate estimated time remaining"""
+    if completed_items == 0:
+        return "Calculating..."
+    elapsed_time = time.time() - start_time
+    time_per_item = elapsed_time / completed_items
+    remaining_items = total_items - completed_items
+    remaining_time = time_per_item * remaining_items
+    return format_time_remaining(remaining_time)
+def estimate_text_duration(text):
+    """Estimate speech duration in minutes based on text length"""
+    # Simple heuristic:
+    # For English (space-separated), ~150 words/min
+    # For Chinese (no spaces), ~300 chars/min
+    # We'll use a hybrid approach: count spaces to guess if it's space-separated.
+    if not text:
+        return 0
+    space_count = text.count(' ')
+    total_len = len(text)
+    # If spaces are < 10% of length, assume non-space-separated (like Chinese)
+    if space_count / total_len < 0.1:
+        # Approx 300 chars per minute for Chinese
+        duration = total_len / 300
+        # logger.debug(f"Estimated duration (char-based): {duration:.2f} min ({total_len} chars)")
+    else:
+        # Approx 150 words per minute for English
+        word_count = len(text.split())
+        duration = word_count / 150
+        # logger.debug(f"Estimated duration (word-based): {duration:.2f} min ({word_count} words)")
+    return duration
+def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
+    """Split text into segments that won't exceed limit with safety margin"""
+    max_duration = max_duration_minutes
+    estimated_duration = estimate_text_duration(text)
+    logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")
+    if estimated_duration <= max_duration and len(text) <= max_chars:
+        return [text]
+    logger.info(f"Text exceeds limits. Splitting...")
+    # Split by paragraphs first
+    paragraphs = text.split('\n\n')
+    segments = []
+    current_segment = ""
+    for paragraph in paragraphs:
+        paragraph_duration = estimate_text_duration(paragraph)
+        # If single paragraph is too long, split by sentences
+        # Improved regex to include Chinese punctuation
+        if paragraph_duration > max_duration or len(paragraph) > max_chars:
+            sentences = re.split(r'([.!?。！？]+)', paragraph)
+            # Re-attach delimiters to sentences
+            real_sentences = []
+            for i in range(0, len(sentences) - 1, 2):
+                real_sentences.append(sentences[i] + sentences[i+1])
+            if len(sentences) % 2 == 1 and sentences[-1]:
+                real_sentences.append(sentences[-1])
+            for sentence in real_sentences:
+                sentence = sentence.strip()
+                if not sentence:
+                    continue
+                # Check both duration and char count
+                if (estimate_text_duration(current_segment + sentence) > max_duration or
+                    len(current_segment + sentence) > max_chars) and current_segment:
+                    segments.append(current_segment.strip())
+                    current_segment = sentence
+                else:
+                    current_segment += sentence
+        else:
+            if (estimate_text_duration(current_segment + paragraph) > max_duration or
+                len(current_segment + paragraph) > max_chars) and current_segment:
+                segments.append(current_segment.strip())
+                current_segment = paragraph + "\n\n"
+            else:
+                current_segment += paragraph + "\n\n"
+    if current_segment.strip():
+        segments.append(current_segment.strip())
+    logger.info(f"Split text into {len(segments)} segments.")
+    return segments
+import io
+async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
+    """Generate audio for a single text segment and return as BytesIO"""
+    logger.info(f"Generating segment {segment_index}...")
+    communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
+    audio_data = io.BytesIO()
+    try:
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                audio_data.write(chunk["data"])
+    except Exception as e:
+        logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
+        raise gr.Error(f"Error generating segment {segment_index}: {e}")
+    audio_data.seek(0)
+    # Verify segment duration
+    try:
+        # Make a copy for verification so we don't consume the main buffer
+        verify_buffer = io.BytesIO(audio_data.getvalue())
+        seg_audio = AudioSegment.from_mp3(verify_buffer)
+        duration_min = len(seg_audio) / 1000 / 60
+        logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
+    except Exception as e:
+        logger.error(f"Error checking segment {segment_index} duration: {e}")
+    audio_data.seek(0)
+    return audio_data
+async def merge_audio_files(audio_objects):
+    """Merge multiple audio BytesIO objects into one file"""
+    if not audio_objects:
+        return None
+    logger.info(f"Merging {len(audio_objects)} audio segments...")
+    # Load and merge audio segments
+    combined = AudioSegment.empty()
+    for i, audio_obj in enumerate(audio_objects):
+        try:
+            audio_obj.seek(0)
+            segment = AudioSegment.from_mp3(audio_obj)
+            combined += segment
+            # Explicitly close/clear the BytesIO object to free memory
+            audio_obj.close()
+        except Exception as e:
+            logger.error(f"Error merging segment {i+1}: {e}")
+    # Save merged audio to a single temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        merged_path = tmp_file.name
+        combined.export(merged_path, format="mp3")
+    total_duration_min = len(combined) / 1000 / 60
+    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
+    return merged_path
+async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
+    """Generate speech with detailed progress tracking via generator"""
+    if not text.strip():
+        yield None, "Please enter text to convert.", None
+        return
+    if not voice:
+        yield None, "Please select a voice.", None
+        return
+    # Apply text cleaning if enabled
+    if cleaning_options and cleaning_options.get('enable_cleaning', False):
+        yield 0, "Cleaning text...", None
+        # original_text = text # Unused
+        text = TextCleaner.clean_text(text, cleaning_options)
+        if cleaning_options.get('save_cleaned', False):
+            # Create a filename based on timestamp or first few words
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"text_{timestamp}.txt"
+            saved_path = TextCleaner.save_cleaned_text(text, filename)
+            if saved_path:
+                logger.info(f"Saved cleaned text to {saved_path}")
+        if not text.strip():
+            yield None, "Text cleaning resulted in empty text.", None
+            return
+    voice_short_name = voice.split(" - ")[0]
+    rate_str = f"{rate:+d}%"
+    volume_str = f"{volume:+d}%"
+    pitch_str = f"{pitch:+d}Hz"
+    # Check if text is too long and needs segmentation
+    estimated_duration = estimate_text_duration(text)
+    yield 0, "Starting text processing...", None
+    logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
+    if estimated_duration > 15:  # If longer than 15 minutes, split into segments
+        segments = split_text_by_paragraphs(text)
+        total_segments = len(segments)
+        segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
+        yield 5, segment_info, segment_info
+        if total_segments > 1:
+            # Generate audio for each segment with progress tracking
+            audio_objects = []
+            start_time = time.time()
+            for i, segment in enumerate(segments):
+                if segment.strip():
+                    segment_duration = estimate_text_duration(segment)
+                    progress = 10 + (80 * i / total_segments)  # 10% to 90%
+                    eta = calculate_eta(start_time, i, total_segments)
+                    status_msg = (
+                        f"Generating segment {i+1}/{total_segments}...\n"
+                        f"Segment duration: {segment_duration:.1f} min\n"
+                        f"ETA: {eta}"
+                    )
+                    logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
+                    yield progress, status_msg, segment_info
+                    # Generate to memory
+                    audio_obj = await generate_audio_segment(
+                        segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
+                    )
+                    audio_objects.append(audio_obj)
+            yield 90, "Merging audio files...", segment_info
+            # Merge all audio objects
+            merged_audio_path = await merge_audio_files(audio_objects)
+            yield 100, "Audio generation complete! ✅", segment_info
+            yield merged_audio_path, "Done", segment_info
+            return
+    # For short texts or single segment, use original method
+    yield 50, "Generating audio...", None
+    logger.info("Generating single segment audio...")
+    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+        await communicate.save(tmp_path)
+    logger.info(f"Audio generated at {tmp_path}")
+    yield 100, "Audio generation complete! ✅", None
+    yield tmp_path, "Done", None
+async def tts_interface(text, voice, rate, volume, pitch,
+                        enable_cleaning, save_cleaned, clean_urls, clean_html,
+                        clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
+                        del_special, wetext_norm):
+    """Stream TTS progress and, finally, the audio path to the Gradio UI.
+
+    Async generator bound to the "Generate Audio" button. Every yield is a
+    4-tuple in the order of the click handler's outputs:
+    ``(audio_path_or_None, progress_update, status_text, segment_update)``.
+
+    Args:
+        text: Raw input text from the textbox.
+        voice: Selected voice label from the dropdown ("" when none chosen).
+        rate, volume, pitch: Slider values, forwarded unchanged to
+            ``text_to_speech_generator``.
+        enable_cleaning, save_cleaned, clean_urls, clean_html, clean_markdown,
+        clean_ads, fix_enc, tidy_ws, del_gutenberg, del_special, wetext_norm:
+            Checkbox values, packed into the cleaning-options dict that is
+            passed to the generator.
+
+    Yields:
+        A validation message (and nothing else) when text or voice is
+        missing; otherwise an initial "Starting..." update, one update per
+        progress tuple from the generator, and a final tuple whose first
+        element is the generated audio file path.
+    """
+    # `not text` also rejects None, which would otherwise raise on .strip().
+    if not text or not text.strip():
+        yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
+        return
+    if not voice:
+        yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
+        return
+    # Prepare cleaning options
+    cleaning_options = {
+        'enable_cleaning': enable_cleaning,
+        'save_cleaned': save_cleaned,
+        'remove_urls': clean_urls,
+        'remove_html': clean_html,
+        'remove_markdown': clean_markdown,
+        'filter_ads': clean_ads,
+        'fix_encoding': fix_enc,
+        'tidy_whitespace': tidy_ws,
+        'remove_gutenberg': del_gutenberg,
+        'remove_special_chars': del_special,
+        'wetext_normalization': wetext_norm
+    }
+    # The raw text is handed to the generator together with the options; it
+    # is deliberately not cleaned here. An earlier version cleaned the text
+    # and estimated its duration at this point, but neither result was used,
+    # so that work was pure overhead on every request.
+    # Reset UI
+    yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
+    async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
+        # Anything that is not the expected 3-tuple is ignored.
+        if not (isinstance(result, tuple) and len(result) == 3):
+            continue
+        progress_val, status_msg, segment_info = result
+        if isinstance(progress_val, (int, float)):
+            # Progress update: (percent, status message, segment info or None).
+            segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
+            yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
+        else:
+            # Final result: the first element is the audio file path.
+            yield progress_val, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
+async def create_demo():
+    """Build and return the Gradio Blocks UI for the Edge TTS app.
+
+    Fetches the voice list with ``get_voices()``, lays out the inputs (text,
+    cleaning options, voice, rate/volume/pitch) in the left column and the
+    outputs (audio player plus progress, status and segment Markdown panels)
+    in the right column, then wires the text-change and button-click events.
+
+    Returns:
+        The constructed ``gr.Blocks`` instance; queueing and launching are
+        left to the caller.
+    """
+    # Maps a display label ("ShortName - Locale (Gender)") to the short name.
+    voices = await get_voices()
+    description = """
+    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
+    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
+    Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
+    Transform your words into stunning, professional-quality videos in just a few clicks.
+    ✨ Features:
+    • Convert text to engaging videos with customizable visuals
+    • Choose from 40+ languages and 300+ voices
+    • Perfect for creating audiobooks, storytelling, and language learning materials
+    • Ideal for educators, content creators, and language enthusiasts
+    📝 **Long Text Support**:
+    Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
+    """
+    # Preselect the first voice whose label contains "XiaoxiaoNeural"; if none
+    # matches, the dropdown starts on its empty choice.
+    default_voice = ""
+    for voice_key in voices.keys():
+        if "XiaoxiaoNeural" in voice_key:
+            default_voice = voice_key
+            break
+    with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
+        gr.Markdown("# Edge TTS Text-to-Speech")
+        gr.Markdown(description)
+        with gr.Row():
+            # Left column: all user inputs.
+            with gr.Column():
+                text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
+                # Add text analysis info
+                text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
+                with gr.Accordion("Text Cleaning Settings", open=True):
+                    with gr.Row():
+                        enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
+                        save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)
+                    # Individual cleaning steps; the whole group is hidden
+                    # when the master "Enable Text Cleaning" box is cleared.
+                    with gr.Group(visible=True) as cleaning_options_group:
+                        with gr.Row():
+                            clean_urls = gr.Checkbox(label="Remove URLs", value=True)
+                            clean_html = gr.Checkbox(label="Remove HTML", value=True)
+                        with gr.Row():
+                            clean_markdown = gr.Checkbox(label="Remove Markdown", value=True)
+                            clean_ads = gr.Checkbox(label="Filter Ads", value=True)
+                        with gr.Row():
+                            fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
+                            tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)
+                        with gr.Row():
+                            del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
+                            del_special = gr.Checkbox(label="Remove Special Characters", value=True)
+                        with gr.Row():
+                            wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)
+                    def toggle_options(enabled):
+                        """Return an update making the options group visible iff `enabled`."""
+                        return gr.update(visible=enabled)
+                    enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])
+                # The leading "" entry lets the dropdown represent "no voice selected".
+                voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
+                with gr.Row():
+                    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
+                    volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
+                    pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
+                generate_btn = gr.Button("Generate Audio", variant="primary")
+            # Right column: generated audio and progress/status panels.
+            with gr.Column():
+                audio_output = gr.Audio(label="Generated Audio", type="filepath")
+                # Progress and status display
+                with gr.Group():
+                    gr.Markdown("### 📊 Processing Progress")
+                    progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)
+                # Processing details
+                with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
+                    status_output = gr.Markdown("Waiting...", visible=True)
+                # Segment information display
+                with gr.Accordion("📋 Segment Information", open=True) as segment_info:
+                    segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)
+        gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
+        # Add text analysis function
+        def analyze_text(text):
+            """Return the Markdown summary shown under the input textbox.
+
+            Reports whitespace-separated word count, character count and the
+            estimated speech time in minutes. When the estimate exceeds 15
+            minutes, it also reports how many segments
+            ``split_text_by_paragraphs`` produces for the text.
+            """
+            if not text.strip():
+                return "**Text Analysis**: Enter text to see estimated duration and segment count"
+            duration = estimate_text_duration(text)
+            word_count = len(text.split())
+            char_count = len(text)
+            if duration > 15:
+                segments = split_text_by_paragraphs(text)
+                segment_count = len(segments)
+                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
+            else:
+                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
+        # Update text analysis when text changes
+        text_input.change(
+            fn=analyze_text,
+            inputs=[text_input],
+            outputs=[text_info]
+        )
+        # The input order must match tts_interface's positional parameters;
+        # the four outputs receive the 4-tuples that tts_interface yields.
+        generate_btn.click(
+            fn=tts_interface,
+            inputs=[
+                text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
+                enable_cleaning, save_cleaned, clean_urls, clean_html,
+                clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
+                del_special, wetext_norm
+            ],
+            outputs=[audio_output, progress_info, status_output, segment_details]
+        )
+    return demo
+async def main():
+    """Build the Gradio app, enable its request queue, and launch the server."""
+    app = await create_demo()
+    # Queue first so the concurrency limit is in place before launching.
+    app.queue(default_concurrency_limit=5)
+    app.launch(show_api=False)
+# Script entry point: when executed directly, run the async main() to completion.
+if __name__ == "__main__":
+    asyncio.run(main())

text_cleaning.py CHANGED Viewed

@@ -82,6 +82,47 @@ class TextCleaner:
         return '\n'.join(lines[start_idx:end_idx])
     @staticmethod
     def remove_special_chars(text):
         """Remove excessive special characters"""
@@ -128,6 +169,9 @@ class TextCleaner:
         if options.get('remove_html', False):
             text = cls.remove_html(text)
         if options.get('remove_urls', False):
             text = cls.remove_urls(text)

         return '\n'.join(lines[start_idx:end_idx])
+    @staticmethod
+    def remove_markdown(text):
+        """Strip Markdown formatting symbols while keeping the readable text.
+
+        Rules run in a deliberate order: code first, then images before
+        links, then block-level markers (headers, blockquotes, horizontal
+        rules, list markers), and inline emphasis last. Running emphasis
+        earlier would let ``*``/``_`` patterns consume horizontal rules and
+        bullet markers, and running links before images would turn
+        ``![alt](url)`` into a stray ``!alt``.
+
+        Args:
+            text: Input text that may contain Markdown.
+
+        Returns:
+            The text with Markdown syntax removed; fenced code blocks are
+            dropped entirely, other constructs keep their visible content.
+        """
+        # Remove code blocks first (```code```)
+        text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
+        # Remove inline code (`code`)
+        text = re.sub(r'`([^`]+)`', r'\1', text)
+        # Remove images ![alt](url) -> alt (must precede links, whose
+        # pattern also matches the "[alt](url)" part of an image)
+        text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)
+        # Remove links [text](url) -> text
+        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
+        # Remove headers (# ## ### etc.)
+        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
+        # Remove blockquotes (> text)
+        text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
+        # Remove horizontal rules (---, ***, ___) before emphasis, which
+        # would otherwise reduce "***" / "___" to a single "*" / "_"
+        text = re.sub(r'^[\-\*_]{3,}\s*$', '', text, flags=re.MULTILINE)
+        # Remove list markers (-, *, +, 1., 2., etc.) before emphasis, so a
+        # leading "* " bullet cannot pair up with a later "*"
+        text = re.sub(r'^\s*[\-\*\+]\s+', '', text, flags=re.MULTILINE)
+        text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
+        # Remove bold (**text** or __text__); underscores only count when
+        # they are not inside a word, so identifiers like my__var__x survive
+        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
+        text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
+        # Remove italic (*text* or _text_), leaving snake_case words intact
+        text = re.sub(r'\*(.+?)\*', r'\1', text)
+        text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
+        # Remove strikethrough (~~text~~)
+        text = re.sub(r'~~(.+?)~~', r'\1', text)
+        return text
     @staticmethod
     def remove_special_chars(text):
         """Remove excessive special characters"""
         if options.get('remove_html', False):
             text = cls.remove_html(text)
+        if options.get('remove_markdown', False):
+            text = cls.remove_markdown(text)
         if options.get('remove_urls', False):
             text = cls.remove_urls(text)