cs2764 committed on
Commit
3c74b9d
·
verified ·
1 Parent(s): 11cf4ef

Add file upload

Browse files
Files changed (2) hide show
  1. app.py +266 -14
  2. requirements.txt +3 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
4
  import tempfile
5
  import os
6
  import re
 
7
  from pydub import AudioSegment
8
  import math
9
  import time
@@ -11,6 +12,24 @@ from datetime import datetime, timedelta
11
  import logging
12
  from text_cleaning import TextCleaner
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Configure logging
15
  logging.basicConfig(
16
  level=logging.INFO,
@@ -21,6 +40,137 @@ logging.basicConfig(
21
  )
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  async def get_voices():
25
  voices = await edge_tts.list_voices()
26
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
@@ -190,7 +340,7 @@ async def merge_audio_files(audio_paths):
190
  logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
191
  return merged_path
192
 
193
- async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
194
  """Generate speech with detailed progress tracking via generator"""
195
  if not text.strip():
196
  yield None, "Please enter text to convert.", None
@@ -228,6 +378,15 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
228
  yield 0, "Starting text processing...", None
229
  logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
230
 
 
 
 
 
 
 
 
 
 
231
  if estimated_duration > 15: # If longer than 15 minutes, split into segments
232
  segments = split_text_by_paragraphs(text)
233
  total_segments = len(segments)
@@ -264,9 +423,25 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
264
 
265
  # Merge all audio objects
266
  merged_audio_path = await merge_audio_files(audio_objects)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  yield 100, "Audio generation complete! ✅", segment_info
269
- yield merged_audio_path, "Done", segment_info
270
  return
271
 
272
  # For short texts or single segment, use original method
@@ -278,17 +453,41 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
278
  tmp_path = tmp_file.name
279
  await communicate.save(tmp_path)
280
 
281
- logger.info(f"Audio generated at {tmp_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  yield 100, "Audio generation complete! ✅", None
283
- yield tmp_path, "Done", None
284
 
285
- async def tts_interface(text, voice, rate, volume, pitch,
286
  enable_cleaning, save_cleaned, clean_urls, clean_html,
287
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
288
  del_special, wetext_norm):
289
  """Enhanced TTS interface with detailed progress tracking"""
 
 
 
 
 
 
 
290
  if not text.strip():
291
- yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
292
  return
293
  if not voice:
294
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
@@ -326,7 +525,7 @@ async def tts_interface(text, voice, rate, volume, pitch,
326
  # Reset UI
327
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
328
 
329
- async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
330
  if isinstance(result, tuple) and len(result) == 3:
331
  # Progress update
332
  progress_val, status_msg, segment_info = result
@@ -375,8 +574,15 @@ async def create_demo():
375
  with gr.Column():
376
  text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
377
 
 
 
 
 
 
 
 
378
  # Add text analysis info
379
- text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
380
 
381
  with gr.Accordion("Text Cleaning Settings", open=True):
382
  with gr.Row():
@@ -415,6 +621,14 @@ async def create_demo():
415
  volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
416
  pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
417
 
 
 
 
 
 
 
 
 
418
  generate_btn = gr.Button("Generate Audio", variant="primary")
419
 
420
  with gr.Column():
@@ -436,9 +650,17 @@ async def create_demo():
436
  gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
437
 
438
  # Add text analysis function
439
- def analyze_text(text):
440
- if not text.strip():
441
- return "**Text Analysis**: Enter text to see estimated duration and segment count"
 
 
 
 
 
 
 
 
442
 
443
  duration = estimate_text_duration(text)
444
  word_count = len(text.split())
@@ -451,18 +673,48 @@ async def create_demo():
451
  else:
452
  return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  # Update text analysis when text changes
455
  text_input.change(
456
  fn=analyze_text,
457
- inputs=[text_input],
458
  outputs=[text_info]
459
  )
 
 
 
 
 
 
 
460
 
461
  generate_btn.click(
462
  fn=tts_interface,
463
  inputs=[
464
- text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
465
- enable_cleaning, save_cleaned, clean_urls, clean_html,
466
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
467
  del_special, wetext_norm
468
  ],
 
4
  import tempfile
5
  import os
6
  import re
7
+ import shutil
8
  from pydub import AudioSegment
9
  import math
10
  import time
 
12
  import logging
13
  from text_cleaning import TextCleaner
14
 
15
+ # EPUB parsing
16
+ try:
17
+ import ebooklib
18
+ from ebooklib import epub
19
+ from bs4 import BeautifulSoup
20
+ EPUB_SUPPORT = True
21
+ except ImportError:
22
+ EPUB_SUPPORT = False
23
+ logging.warning("ebooklib or beautifulsoup4 not installed. EPUB support disabled.")
24
+
25
+ # Encoding detection
26
+ try:
27
+ import chardet
28
+ CHARDET_SUPPORT = True
29
+ except ImportError:
30
+ CHARDET_SUPPORT = False
31
+ logging.warning("chardet not installed. Encoding detection will use fallback method.")
32
+
33
  # Configure logging
34
  logging.basicConfig(
35
  level=logging.INFO,
 
40
  )
41
  logger = logging.getLogger(__name__)
42
 
43
def detect_file_encoding(file_path):
    """Detect the text encoding of a file.

    Uses chardet when available and returns a normalized encoding name,
    or None when detection is unavailable or unsuccessful (callers then
    probe a list of common encodings themselves).
    """
    if not CHARDET_SUPPORT:
        # Fallback: let the caller try common encodings directly.
        return None

    # Sample a bounded prefix instead of loading the whole file: chardet
    # is accurate on a prefix and uploaded books can be very large.
    with open(file_path, 'rb') as f:
        raw_data = f.read(256 * 1024)

    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence'] or 0.0  # confidence is 0/None when undetected
    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

    if not encoding:
        return None

    # Map common aliases/subsets to the safest superset codec.
    encoding_map = {
        'gb2312': 'gbk',          # GBK is superset of GB2312
        'gb18030': 'gb18030',
        'ascii': 'utf-8',         # ASCII is subset of UTF-8
        'iso-8859-1': 'latin-1',
        'windows-1252': 'cp1252',
    }
    return encoding_map.get(encoding.lower(), encoding)
69
+
70
def read_text_file_with_encoding(file_path):
    """Read a text file with automatic encoding detection.

    Tries the chardet-detected encoding first, then a priority list of
    common encodings. Returns the decoded text, or None if every
    candidate fails.
    """
    detected_encoding = detect_file_encoding(file_path)

    # Candidate order matters: latin-1 and cp1252 accept (nearly) ANY
    # byte sequence, so they must come AFTER the strict multi-byte
    # codecs — otherwise shift_jis/euc-kr/etc. would never be tried.
    encodings_to_try = []
    if detected_encoding:
        encodings_to_try.append(detected_encoding)
    encodings_to_try.extend([
        'utf-8',
        'utf-8-sig',   # UTF-8 with BOM
        'gbk',         # Chinese (simplified)
        'gb18030',     # Chinese (extended)
        'big5',        # Chinese (traditional)
        'utf-16',
        'shift_jis',   # Japanese
        'euc-kr',      # Korean
        'cp1252',      # Windows Western (rejects a few bytes)
        'latin-1',     # Western European; never fails, so keep it last
    ])

    # Remove duplicates (case-insensitively) while preserving priority order.
    seen = set()
    unique_encodings = []
    for enc in encodings_to_try:
        if enc and enc.lower() not in seen:
            seen.add(enc.lower())
            unique_encodings.append(enc)

    last_error = None
    for encoding in unique_encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
            # Reject decodes that produced mostly replacement characters.
            if text.count('\ufffd') > len(text) * 0.1:
                logger.debug(f"Encoding {encoding} produced too many replacement characters, trying next...")
                continue
            logger.info(f"Successfully read file with encoding: {encoding}")
            return text
        except (UnicodeDecodeError, LookupError) as e:
            last_error = e
            logger.debug(f"Failed to decode with {encoding}: {e}")
            continue

    logger.error(f"Failed to decode file with any encoding. Last error: {last_error}")
    return None
123
+
124
def parse_uploaded_file(file_path):
    """Parse an uploaded .txt or .epub file.

    Returns (text, base_filename). text is None on parse failure; both
    are None when no file was provided or the extension is unsupported.
    """
    if file_path is None:
        return None, None

    filename = os.path.splitext(os.path.basename(file_path))[0]
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.txt':
        text = read_text_file_with_encoding(file_path)
        if text:
            # Log the actual file name (was a literal placeholder before).
            logger.info(f"Parsed TXT file: {filename}, {len(text)} chars")
            return text, filename
        logger.error(f"Failed to decode TXT file: {filename}")
        return None, filename

    if ext == '.epub':
        if not EPUB_SUPPORT:
            logger.error("EPUB support not available")
            return None, filename
        try:
            book = epub.read_epub(file_path)
            text_parts = []
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    # Strip the XHTML markup, keeping paragraph breaks.
                    soup = BeautifulSoup(item.get_content(), 'html.parser')
                    text_parts.append(soup.get_text(separator='\n'))
            text = '\n\n'.join(text_parts)
            logger.info(f"Parsed EPUB file: {filename}, {len(text)} chars")
            return text, filename
        except Exception as e:
            logger.error(f"Failed to parse EPUB: {e}")
            return None, filename

    # Unsupported extension.
    return None, None
160
+
161
async def convert_to_m4b(mp3_path, output_filename):
    """Convert an MP3 file to M4B (AAC audio in an MP4 container).

    output_filename is accepted for interface compatibility but unused;
    the caller renames the result itself. Returns the path of the new
    file, or None on failure (e.g. ffmpeg/AAC codec missing).
    """
    try:
        audio = AudioSegment.from_mp3(mp3_path)
        # mkstemp + close instead of NamedTemporaryFile(...).name: the
        # open descriptor would block ffmpeg from reopening the path on
        # Windows and would otherwise leak until GC.
        fd, m4b_path = tempfile.mkstemp(suffix=".m4b")
        os.close(fd)
        # "ipod" container with AAC: m4b is essentially m4a with audiobook metadata.
        audio.export(m4b_path, format="ipod", codec="aac")
        logger.info(f"Converted to M4B: {m4b_path}")
        return m4b_path
    except Exception as e:
        logger.error(f"Failed to convert to M4B: {e}")
        return None
173
+
174
async def get_voices():
    """Return a mapping of display labels ("ShortName - Locale (Gender)") to Edge TTS short names."""
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 
340
  logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
341
  return merged_path
342
 
343
+ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None, output_format="mp3", output_filename=None):
344
  """Generate speech with detailed progress tracking via generator"""
345
  if not text.strip():
346
  yield None, "Please enter text to convert.", None
 
378
  yield 0, "Starting text processing...", None
379
  logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
380
 
381
+ # Generate output filename with timestamp
382
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
383
+ if output_filename:
384
+ final_filename = f"{output_filename}_{timestamp}"
385
+ else:
386
+ final_filename = f"audio_{timestamp}"
387
+
388
+ final_audio_path = None
389
+
390
  if estimated_duration > 15: # If longer than 15 minutes, split into segments
391
  segments = split_text_by_paragraphs(text)
392
  total_segments = len(segments)
 
423
 
424
  # Merge all audio objects
425
  merged_audio_path = await merge_audio_files(audio_objects)
426
+ final_audio_path = merged_audio_path
427
+
428
+ # Convert to M4B if requested
429
+ if output_format == "m4b" and merged_audio_path:
430
+ yield 95, "Converting to M4B format...", segment_info
431
+ m4b_path = await convert_to_m4b(merged_audio_path, final_filename)
432
+ if m4b_path:
433
+ os.remove(merged_audio_path)
434
+ final_audio_path = m4b_path
435
+
436
+ # Rename to final filename
437
+ if final_audio_path:
438
+ ext = ".m4b" if output_format == "m4b" else ".mp3"
439
+ new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
440
+ shutil.move(final_audio_path, new_path)
441
+ final_audio_path = new_path
442
 
443
  yield 100, "Audio generation complete! ✅", segment_info
444
+ yield final_audio_path, "Done", segment_info
445
  return
446
 
447
  # For short texts or single segment, use original method
 
453
  tmp_path = tmp_file.name
454
  await communicate.save(tmp_path)
455
 
456
+ final_audio_path = tmp_path
457
+
458
+ # Convert to M4B if requested
459
+ if output_format == "m4b":
460
+ yield 80, "Converting to M4B format...", None
461
+ m4b_path = await convert_to_m4b(tmp_path, final_filename)
462
+ if m4b_path:
463
+ os.remove(tmp_path)
464
+ final_audio_path = m4b_path
465
+
466
+ # Rename to final filename
467
+ if final_audio_path:
468
+ ext = ".m4b" if output_format == "m4b" else ".mp3"
469
+ new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
470
+ shutil.move(final_audio_path, new_path)
471
+ final_audio_path = new_path
472
+
473
+ logger.info(f"Audio generated at {final_audio_path}")
474
  yield 100, "Audio generation complete! ✅", None
475
+ yield final_audio_path, "Done", None
476
 
477
+ async def tts_interface(text, uploaded_file, voice, rate, volume, pitch, output_format,
478
  enable_cleaning, save_cleaned, clean_urls, clean_html,
479
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
480
  del_special, wetext_norm):
481
  """Enhanced TTS interface with detailed progress tracking"""
482
+
483
+ # Get output filename from uploaded file (if any)
484
+ output_filename = None
485
+ if uploaded_file is not None:
486
+ output_filename = os.path.splitext(os.path.basename(uploaded_file))[0]
487
+ logger.info(f"Using filename from uploaded file: {output_filename}")
488
+
489
  if not text.strip():
490
+ yield None, gr.update(visible=True, value="Please enter text or upload a file."), "No text provided", gr.update(visible=False)
491
  return
492
  if not voice:
493
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
 
525
  # Reset UI
526
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
527
 
528
+ async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options, output_format, output_filename):
529
  if isinstance(result, tuple) and len(result) == 3:
530
  # Progress update
531
  progress_val, status_msg, segment_info = result
 
574
  with gr.Column():
575
  text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
576
 
577
+ # File upload component
578
+ file_upload = gr.File(
579
+ label="Or Upload File (TXT/EPUB)",
580
+ file_types=[".txt", ".epub"],
581
+ type="filepath"
582
+ )
583
+
584
  # Add text analysis info
585
+ text_info = gr.Markdown("**Text Analysis**: Enter text or upload a file to see estimated duration and segment count", visible=True)
586
 
587
  with gr.Accordion("Text Cleaning Settings", open=True):
588
  with gr.Row():
 
621
  volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
622
  pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
623
 
624
+ # Output format selection
625
+ output_format = gr.Radio(
626
+ choices=["mp3", "m4b"],
627
+ value="mp3",
628
+ label="Output Format",
629
+ info="MP3 is default. M4B is audiobook format (requires ffmpeg)."
630
+ )
631
+
632
  generate_btn = gr.Button("Generate Audio", variant="primary")
633
 
634
  with gr.Column():
 
650
  gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
651
 
652
  # Add text analysis function
653
+ def analyze_text(text, uploaded_file):
654
+ # If file is uploaded, parse it first
655
+ if uploaded_file is not None:
656
+ file_text, filename = parse_uploaded_file(uploaded_file)
657
+ if file_text:
658
+ text = file_text
659
+ else:
660
+ return f"**Text Analysis**: Failed to parse uploaded file"
661
+
662
+ if not text or not text.strip():
663
+ return "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
664
 
665
  duration = estimate_text_duration(text)
666
  word_count = len(text.split())
 
673
  else:
674
  return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
675
 
676
# Handle file upload - show preview in text box
def on_file_upload(uploaded_file):
    """Load an uploaded file into the text box and refresh the analysis line."""
    idle_msg = "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
    if uploaded_file is None:
        return gr.update(), idle_msg

    file_text, filename = parse_uploaded_file(uploaded_file)
    if not file_text:
        return gr.update(), "**Text Analysis**: Failed to parse uploaded file"

    # Calculate analysis
    duration = estimate_text_duration(file_text)
    word_count = len(file_text.split())
    char_count = len(file_text)

    analysis = f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
    if duration > 15:
        segment_count = len(split_text_by_paragraphs(file_text))
        analysis += f", {segment_count} segments will be generated"

    return gr.update(value=file_text), analysis
698
+
699
  # Update text analysis when text changes
700
  text_input.change(
701
  fn=analyze_text,
702
+ inputs=[text_input, file_upload],
703
  outputs=[text_info]
704
  )
705
+
706
+ # Update text box and analysis when file is uploaded
707
+ file_upload.change(
708
+ fn=on_file_upload,
709
+ inputs=[file_upload],
710
+ outputs=[text_input, text_info]
711
+ )
712
 
713
  generate_btn.click(
714
  fn=tts_interface,
715
  inputs=[
716
+ text_input, file_upload, voice_dropdown, rate_slider, volume_slider, pitch_slider,
717
+ output_format, enable_cleaning, save_cleaned, clean_urls, clean_html,
718
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
719
  del_special, wetext_norm
720
  ],
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  edge_tts==7.0.0
2
  gradio>=5.0.0
3
  pydub>=0.25.1
 
 
 
 
1
  edge_tts==7.0.0
2
  gradio>=5.0.0
3
  pydub>=0.25.1
4
+ ebooklib>=0.18
5
+ beautifulsoup4>=4.12.0
6
+ chardet>=5.0.0