Edge-TTS-WebUI-Long-Text

Sleeping

App Files Files Community

cs2764 committed on Nov 19, 2025

Commit

11efcf3

verified ·

1 Parent(s): e5d9894

Upload 2 files

Browse files

Add long text support

Files changed (2) hide show

app.py +319 -28
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -3,32 +3,273 @@ import edge_tts
 import asyncio
 import tempfile
 import os
 async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-async def text_to_speech(text, voice, rate, volume, pitch):
     if not text.strip():
-        return None, "Please enter text to convert."
     if not voice:
-        return None, "Please select a voice."
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     volume_str = f"{volume:+d}%"
     pitch_str = f"{pitch:+d}Hz"
     communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
-    return tmp_path, None
 async def tts_interface(text, voice, rate, volume, pitch):
-    audio, warning = await text_to_speech(text, voice, rate, volume, pitch)
-    if warning:
-        return audio, gr.Warning(warning)
-    return audio, None
 async def create_demo():
     voices = await get_voices()
@@ -46,6 +287,9 @@ async def create_demo():
     • Choose from 40+ languages and 300+ voices
     • Perfect for creating audiobooks, storytelling, and language learning materials
     • Ideal for educators, content creators, and language enthusiasts
     """
     default_voice = ""
@@ -54,26 +298,73 @@ async def create_demo():
             default_voice = voice_key
             break
-    demo = gr.Interface(
-        fn=tts_interface,
-        inputs=[
-            gr.Textbox(label="Input Text", lines=5),
-            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume Adjustment (%)", step=1),
-            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
-        ],
-        outputs=[
-            gr.Audio(label="Generated Audio", type="filepath"),
-            gr.Markdown(label="Warning", visible=False)
-        ],
-        title="Edge TTS Text-to-Speech",
-        description=description,
-        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
-        analytics_enabled=False,
-        allow_flagging="manual",
-        api_name=None
-    )
     return demo
 async def main():

 import asyncio
 import tempfile
 import os
+import re
+from pydub import AudioSegment
+import math
+import time
+from datetime import datetime, timedelta
+import logging
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
 async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
def format_time_remaining(seconds):
    """Format a duration in seconds as a short human-readable string.

    Args:
        seconds: Remaining time in seconds (int or float). Negative values
            are treated as zero.

    Returns:
        "<n>s" below one minute, "<n.n>m" below one hour, "<n.n>h" otherwise.
    """
    # An ETA can never be negative; clamp instead of printing e.g. "-3s".
    seconds = max(0.0, float(seconds))
    if seconds < 60:
        return f"{int(seconds)}s"
    # Pick the unit from the value as it will be displayed, so 3599s is
    # reported as "1.0h" rather than the misleading "60.0m".
    minutes = round(seconds / 60, 1)
    if minutes < 60:
        return f"{minutes:.1f}m"
    return f"{seconds / 3600:.1f}h"
def calculate_eta(start_time, completed_items, total_items):
    """Return a formatted estimate of the time left for the remaining items.

    The estimate is the average wall-clock time per completed item multiplied
    by the number of items still outstanding. Until at least one item has
    completed there is nothing to average, so a placeholder is returned.
    """
    if completed_items == 0:
        return "Calculating..."
    elapsed = time.time() - start_time
    per_item = elapsed / completed_items
    outstanding = total_items - completed_items
    return format_time_remaining(per_item * outstanding)
def estimate_text_duration(text):
    """Estimate how long the text takes to speak, in minutes.

    Heuristic: when at least 10% of the characters are ASCII spaces the text
    is treated as space-separated (about 150 words per minute); otherwise it
    is treated as unsegmented script such as Chinese (about 300 characters
    per minute). Empty or missing text yields 0.
    """
    if not text:
        return 0
    char_total = len(text)
    space_ratio = text.count(' ') / char_total
    if space_ratio >= 0.1:
        # Space-separated language: count whitespace-delimited words.
        word_count = len(text.split())
        duration = word_count / 150
        logger.info(f"Estimated duration (word-based): {duration:.2f} min ({word_count} words)")
        return duration
    # Few spaces: fall back to a per-character rate.
    duration = char_total / 300
    logger.info(f"Estimated duration (char-based): {duration:.2f} min ({char_total} chars)")
    return duration
def _split_sentences(paragraph):
    """Cut a paragraph into sentence-sized pieces without altering its text.

    A piece ends after ASCII ``.``/``!``/``?`` followed by whitespace (or the
    end of the paragraph), or after CJK ``。``/``！``/``？``. Terminators and
    trailing whitespace stay attached to their piece, so joining the pieces
    reproduces the paragraph exactly and decimals such as "3.14" stay intact.
    """
    return re.findall(
        r'.+?(?:[.!?]+(?:\s+|\Z)|[。！？]+\s*|\Z)',
        paragraph,
        flags=re.DOTALL,
    )


def split_text_by_paragraphs(text, max_duration_minutes=5):
    """Split text into segments whose estimated duration stays under a limit.

    Paragraphs (separated by a blank line) are packed greedily into segments.
    A paragraph that is itself over the limit is packed sentence by sentence
    instead, keeping the original punctuation.

    Args:
        text: Full input text.
        max_duration_minutes: Maximum estimated speech time per segment.

    Returns:
        List of segment strings. ``[text]`` unchanged when the whole text is
        already within the limit. A single sentence longer than the limit is
        not broken up further and becomes one oversized segment.
    """
    max_duration = max_duration_minutes
    estimated_duration = estimate_text_duration(text)
    logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Limit={max_duration}m")
    if estimated_duration <= max_duration:
        return [text]
    logger.info(f"Text duration ({estimated_duration:.2f}m) exceeds limit ({max_duration}m). Splitting...")

    segments = []
    current_segment = ""
    for paragraph in text.split('\n\n'):
        if estimate_text_duration(paragraph) > max_duration:
            # Paragraph alone is too long: fall back to sentence granularity.
            for sentence in _split_sentences(paragraph):
                if not sentence.strip():
                    continue
                if current_segment and estimate_text_duration(current_segment + sentence) > max_duration:
                    segments.append(current_segment.strip())
                    current_segment = sentence
                else:
                    current_segment += sentence
            # Restore the paragraph break consumed by the split above so the
            # next paragraph does not run straight on from the last sentence.
            if current_segment:
                current_segment += "\n\n"
        elif current_segment and estimate_text_duration(current_segment + paragraph) > max_duration:
            segments.append(current_segment.strip())
            current_segment = paragraph + "\n\n"
        else:
            current_segment += paragraph + "\n\n"

    if current_segment.strip():
        segments.append(current_segment.strip())
    logger.info(f"Split text into {len(segments)} segments.")
    return segments
async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
    """Generate audio for a single text segment.

    Args:
        text_segment: Text to synthesize.
        voice_short_name: Edge TTS voice short name.
        rate_str, volume_str, pitch_str: Pre-formatted prosody strings passed
            straight to ``edge_tts.Communicate``.
        segment_index: Segment number, used in log messages and the temp
            file suffix.

    Returns:
        Path of the generated MP3. The file is created with ``delete=False``,
        so the caller is responsible for removing it.

    Raises:
        Whatever ``communicate.save`` raises; the partially written temp file
        is removed before the exception propagates.
    """
    logger.info(f"Generating segment {segment_index}...")
    communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
    # Reserve a unique path, then close our handle before edge-tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_segment_{segment_index}.mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # BaseException so task cancellation is covered too: with delete=False
        # nothing else would ever clean this file up.
        try:
            os.remove(tmp_path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
        raise
    # Verify segment duration (logging only; a decode failure must not fail the segment)
    try:
        seg_audio = AudioSegment.from_mp3(tmp_path)
        duration_min = len(seg_audio) / 1000 / 60
        logger.info(f"Segment {segment_index} generated at {tmp_path} (Duration: {duration_min:.2f} min)")
    except Exception as e:
        logger.error(f"Error checking segment {segment_index} duration: {e}")
    return tmp_path
async def merge_audio_files(audio_files):
    """Merge multiple MP3 files into one and delete the inputs.

    Args:
        audio_files: Paths of MP3 files, in playback order.

    Returns:
        ``None`` for an empty list; the single path unchanged (and not
        deleted) when only one file is given; otherwise the path of a new
        temporary MP3 containing all inputs concatenated.

    A file that cannot be decoded is logged and left out of the result
    rather than aborting the merge. Every input file is removed afterwards,
    whether or not it decoded.
    """
    if not audio_files:
        return None
    if len(audio_files) == 1:
        return audio_files[0]
    logger.info(f"Merging {len(audio_files)} audio files...")
    # Load and merge audio segments
    combined = AudioSegment.empty()
    for audio_file in audio_files:
        try:
            combined += AudioSegment.from_mp3(audio_file)
        except Exception as e:
            logger.error(f"Error merging file {audio_file}: {e}")
        # Clean up temporary segment file; failure here is non-fatal but
        # should be visible, and must not mask unrelated exceptions.
        try:
            os.remove(audio_file)
        except OSError as e:
            logger.warning(f"Could not remove temporary segment {audio_file}: {e}")
    # Reserve the output path and release the handle before exporting to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        merged_path = tmp_file.name
    combined.export(merged_path, format="mp3")
    total_duration_min = len(combined) / 1000 / 60
    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
    return merged_path
async def text_to_speech_generator(text, voice, rate, volume, pitch):
    """Generate speech with detailed progress tracking via generator.

    Yields 3-tuples ``(first, status_message, segment_info)``:

    * progress updates: ``first`` is a number (percent complete);
    * validation failures: ``first`` is ``None`` and the message explains why;
    * the final item: ``first`` is the path of the generated MP3
      (message ``"Done"``).

    Args:
        text: Text to convert.
        voice: Dropdown label of the form "<ShortName> - ..."; only the part
            before the first " - " is used.
        rate, volume, pitch: Numeric adjustments (percent, percent, Hz).
            Non-integer values are rounded to the nearest integer.
    """
    if not text.strip():
        yield None, "Please enter text to convert.", None
        return
    if not voice:
        yield None, "Please select a voice.", None
        return
    voice_short_name = voice.split(" - ")[0]
    # The "d" format code raises ValueError for floats, and slider values may
    # arrive as e.g. 5.0, so coerce to int before formatting.
    rate_str = f"{int(round(rate)):+d}%"
    volume_str = f"{int(round(volume)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    # Check if text is too long and needs segmentation
    estimated_duration = estimate_text_duration(text)
    yield 0, "Starting text processing...", None
    logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
    if estimated_duration > 15:  # If longer than 15 minutes, split into segments
        segments = split_text_by_paragraphs(text)
        total_segments = len(segments)
        segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
        yield 5, segment_info, segment_info
        if total_segments > 1:
            # Generate audio for each segment with progress tracking
            audio_files = []
            start_time = time.time()
            for i, segment in enumerate(segments):
                if not segment.strip():
                    continue
                segment_duration = estimate_text_duration(segment)
                progress = 10 + (80 * i / total_segments)  # 10% to 90%
                eta = calculate_eta(start_time, i, total_segments)
                status_msg = (
                    f"Generating segment {i+1}/{total_segments}...\n"
                    f"Segment duration: {segment_duration:.1f} min\n"
                    f"ETA: {eta}"
                )
                logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
                yield progress, status_msg, segment_info
                audio_file = await generate_audio_segment(
                    segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
                )
                audio_files.append(audio_file)
            yield 90, "Merging audio files...", segment_info
            # Merge all audio files
            merged_audio = await merge_audio_files(audio_files)
            yield 100, "Audio generation complete! ✅", segment_info
            yield merged_audio, "Done", segment_info
            return
    # For short texts or single segment, use original method
    yield 50, "Generating audio...", None
    logger.info("Generating single segment audio...")
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    logger.info(f"Audio generated at {tmp_path}")
    yield 100, "Audio generation complete! ✅", None
    yield tmp_path, "Done", None
 async def tts_interface(text, voice, rate, volume, pitch):
+    """Enhanced TTS interface with detailed progress tracking"""
+    if not text.strip():
+        yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
+        return
+    if not voice:
+        yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
+        return
+    estimated_duration = estimate_text_duration(text)
+    # Reset UI
+    yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
+    async for result in text_to_speech_generator(text, voice, rate, volume, pitch):
+        if isinstance(result, tuple) and len(result) == 3:
+            # Progress update
+            progress_val, status_msg, segment_info = result
+            if isinstance(progress_val, (int, float)):
+                # It's a progress update
+                segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
+                yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
+            else:
+                # It's the final result (path, msg, info)
+                audio_path = progress_val
+                yield audio_path, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
 async def create_demo():
     voices = await get_voices()
     • Choose from 40+ languages and 300+ voices
     • Perfect for creating audiobooks, storytelling, and language learning materials
     • Ideal for educators, content creators, and language enthusiasts
+    📝 **Long Text Support**:
+    Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
     """
     default_voice = ""
             default_voice = voice_key
             break
+    with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
+        gr.Markdown("# Edge TTS Text-to-Speech")
+        gr.Markdown(description)
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
+                # Add text analysis info
+                text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
+                voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
+                with gr.Row():
+                    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
+                    volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
+                    pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
+                generate_btn = gr.Button("Generate Audio", variant="primary")
+            with gr.Column():
+                audio_output = gr.Audio(label="Generated Audio", type="filepath")
+                # Progress and status display
+                with gr.Group():
+                    gr.Markdown("### 📊 Processing Progress")
+                    progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)
+                # Processing details
+                with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
+                    status_output = gr.Markdown("Waiting...", visible=True)
+                # Segment information display
+                with gr.Accordion("📋 Segment Information", open=True) as segment_info:
+                    segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)
+        gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
+        # Add text analysis function
+        def analyze_text(text):
+            if not text.strip():
+                return "**Text Analysis**: Enter text to see estimated duration and segment count"
+            duration = estimate_text_duration(text)
+            word_count = len(text.split())
+            char_count = len(text)
+            if duration > 15:
+                segments = split_text_by_paragraphs(text)
+                segment_count = len(segments)
+                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
+            else:
+                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
+        # Update text analysis when text changes
+        text_input.change(
+            fn=analyze_text,
+            inputs=[text_input],
+            outputs=[text_info]
+        )
+        generate_btn.click(
+            fn=tts_interface,
+            inputs=[text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider],
+            outputs=[audio_output, progress_info, status_output, segment_details]
+        )
     return demo
 async def main():

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
-edge_tts==7.0.0
-gradio==5.21.0

+edge_tts>=7.0.0
+gradio>=4.0.0
+pydub>=0.25.1