Spaces:
Sleeping
Sleeping
Add file upload
Browse files- app.py +266 -14
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import asyncio
|
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
import re
|
|
|
|
| 7 |
from pydub import AudioSegment
|
| 8 |
import math
|
| 9 |
import time
|
|
@@ -11,6 +12,24 @@ from datetime import datetime, timedelta
|
|
| 11 |
import logging
|
| 12 |
from text_cleaning import TextCleaner
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# Configure logging
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
|
@@ -21,6 +40,137 @@ logging.basicConfig(
|
|
| 21 |
)
|
| 22 |
logger = logging.getLogger(__name__)
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
async def get_voices():
|
| 25 |
voices = await edge_tts.list_voices()
|
| 26 |
return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
|
|
@@ -190,7 +340,7 @@ async def merge_audio_files(audio_paths):
|
|
| 190 |
logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
|
| 191 |
return merged_path
|
| 192 |
|
| 193 |
-
async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
|
| 194 |
"""Generate speech with detailed progress tracking via generator"""
|
| 195 |
if not text.strip():
|
| 196 |
yield None, "Please enter text to convert.", None
|
|
@@ -228,6 +378,15 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
|
|
| 228 |
yield 0, "Starting text processing...", None
|
| 229 |
logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
if estimated_duration > 15: # If longer than 15 minutes, split into segments
|
| 232 |
segments = split_text_by_paragraphs(text)
|
| 233 |
total_segments = len(segments)
|
|
@@ -264,9 +423,25 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
|
|
| 264 |
|
| 265 |
# Merge all audio objects
|
| 266 |
merged_audio_path = await merge_audio_files(audio_objects)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
yield 100, "Audio generation complete! ✅", segment_info
|
| 269 |
-
yield
|
| 270 |
return
|
| 271 |
|
| 272 |
# For short texts or single segment, use original method
|
|
@@ -278,17 +453,41 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
|
|
| 278 |
tmp_path = tmp_file.name
|
| 279 |
await communicate.save(tmp_path)
|
| 280 |
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
yield 100, "Audio generation complete! ✅", None
|
| 283 |
-
yield
|
| 284 |
|
| 285 |
-
async def tts_interface(text, voice, rate, volume, pitch,
|
| 286 |
enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 287 |
clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 288 |
del_special, wetext_norm):
|
| 289 |
"""Enhanced TTS interface with detailed progress tracking"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
if not text.strip():
|
| 291 |
-
yield None, gr.update(visible=
|
| 292 |
return
|
| 293 |
if not voice:
|
| 294 |
yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
|
|
@@ -326,7 +525,7 @@ async def tts_interface(text, voice, rate, volume, pitch,
|
|
| 326 |
# Reset UI
|
| 327 |
yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
|
| 328 |
|
| 329 |
-
async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
|
| 330 |
if isinstance(result, tuple) and len(result) == 3:
|
| 331 |
# Progress update
|
| 332 |
progress_val, status_msg, segment_info = result
|
|
@@ -375,8 +574,15 @@ async def create_demo():
|
|
| 375 |
with gr.Column():
|
| 376 |
text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
# Add text analysis info
|
| 379 |
-
text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
|
| 380 |
|
| 381 |
with gr.Accordion("Text Cleaning Settings", open=True):
|
| 382 |
with gr.Row():
|
|
@@ -415,6 +621,14 @@ async def create_demo():
|
|
| 415 |
volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
|
| 416 |
pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
|
| 417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
generate_btn = gr.Button("Generate Audio", variant="primary")
|
| 419 |
|
| 420 |
with gr.Column():
|
|
@@ -436,9 +650,17 @@ async def create_demo():
|
|
| 436 |
gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
|
| 437 |
|
| 438 |
# Add text analysis function
|
| 439 |
-
def analyze_text(text):
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
duration = estimate_text_duration(text)
|
| 444 |
word_count = len(text.split())
|
|
@@ -451,18 +673,48 @@ async def create_demo():
|
|
| 451 |
else:
|
| 452 |
return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
# Update text analysis when text changes
|
| 455 |
text_input.change(
|
| 456 |
fn=analyze_text,
|
| 457 |
-
inputs=[text_input],
|
| 458 |
outputs=[text_info]
|
| 459 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
generate_btn.click(
|
| 462 |
fn=tts_interface,
|
| 463 |
inputs=[
|
| 464 |
-
text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
|
| 465 |
-
enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 466 |
clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 467 |
del_special, wetext_norm
|
| 468 |
],
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
+
import shutil
|
| 8 |
from pydub import AudioSegment
|
| 9 |
import math
|
| 10 |
import time
|
|
|
|
| 12 |
import logging
|
| 13 |
from text_cleaning import TextCleaner
|
| 14 |
|
| 15 |
+
# EPUB parsing
|
| 16 |
+
try:
|
| 17 |
+
import ebooklib
|
| 18 |
+
from ebooklib import epub
|
| 19 |
+
from bs4 import BeautifulSoup
|
| 20 |
+
EPUB_SUPPORT = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
EPUB_SUPPORT = False
|
| 23 |
+
logging.warning("ebooklib or beautifulsoup4 not installed. EPUB support disabled.")
|
| 24 |
+
|
| 25 |
+
# Encoding detection
|
| 26 |
+
try:
|
| 27 |
+
import chardet
|
| 28 |
+
CHARDET_SUPPORT = True
|
| 29 |
+
except ImportError:
|
| 30 |
+
CHARDET_SUPPORT = False
|
| 31 |
+
logging.warning("chardet not installed. Encoding detection will use fallback method.")
|
| 32 |
+
|
| 33 |
# Configure logging
|
| 34 |
logging.basicConfig(
|
| 35 |
level=logging.INFO,
|
|
|
|
| 40 |
)
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
+
def detect_file_encoding(file_path):
    """Detect the text encoding of a file.

    Uses chardet when available and returns a normalized encoding name,
    or None when chardet is missing or cannot identify the encoding
    (callers then fall back to a fixed list of common encodings).
    """
    if not CHARDET_SUPPORT:
        # Fallback: no detector available; caller tries common encodings.
        return None

    # Sample only the head of the file: chardet scans every byte it is
    # given, and a 256 KiB prefix is plenty for a reliable guess even on
    # book-length uploads (the original read the whole file).
    with open(file_path, 'rb') as f:
        raw_data = f.read(256 * 1024)
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    # Guard: format 0.0 rather than crash if confidence is ever missing/None.
    confidence = result.get('confidence') or 0.0
    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

    if encoding:
        # Map common aliases/subsets to the safest superset codec.
        encoding_map = {
            'gb2312': 'gbk',          # GBK is a superset of GB2312
            'gb18030': 'gb18030',
            'ascii': 'utf-8',         # ASCII is a subset of UTF-8
            'iso-8859-1': 'latin-1',
            'windows-1252': 'cp1252',
        }
        encoding = encoding_map.get(encoding.lower(), encoding)
    # May still be None when chardet could not decide.
    return encoding
+
|
| 70 |
+
def read_text_file_with_encoding(file_path):
    """Read a text file, trying the detected encoding then common fallbacks.

    Returns the decoded text, or None when no candidate encoding produces
    a sensible result.
    """
    # chardet-based guess first (may be None).
    detected_encoding = detect_file_encoding(file_path)

    # Candidate encodings in priority order.
    # BUG FIX: 'latin-1' decodes ANY byte sequence without raising (and
    # rarely yields U+FFFD), so in the original list every encoding after
    # it ('cp1252', 'shift_jis', 'euc-kr') was unreachable. The lenient,
    # never-failing codecs must come LAST.
    encodings_to_try = []
    if detected_encoding:
        encodings_to_try.append(detected_encoding)
    encodings_to_try.extend([
        'utf-8',
        'utf-8-sig',   # UTF-8 with BOM
        'gbk',         # Chinese (simplified)
        'gb18030',     # Chinese (extended)
        'big5',        # Chinese (traditional)
        'utf-16',
        'shift_jis',   # Japanese
        'euc-kr',      # Korean
        'cp1252',      # Windows Western
        'latin-1',     # Western European; never fails, keep last
    ])

    # De-duplicate (case-insensitively) while preserving priority order.
    seen = set()
    unique_encodings = []
    for enc in encodings_to_try:
        if enc and enc.lower() not in seen:
            seen.add(enc.lower())
            unique_encodings.append(enc)

    last_error = None
    for encoding in unique_encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
        except (UnicodeDecodeError, LookupError) as e:
            last_error = e
            logger.debug(f"Failed to decode with {encoding}: {e}")
            continue
        # Reject decodes that produced mostly replacement characters
        # (more than 10% of the text) — wrong codec that "succeeded".
        if text.count('\ufffd') > len(text) * 0.1:
            logger.debug(f"Encoding {encoding} produced too many replacement characters, trying next...")
            continue
        logger.info(f"Successfully read file with encoding: {encoding}")
        return text

    logger.error(f"Failed to decode file with any encoding. Last error: {last_error}")
    return None
|
| 124 |
+
def parse_uploaded_file(file_path):
    """Parse an uploaded .txt or .epub file.

    Returns (text, base_filename); text is None when parsing fails, and
    both are None when no file was supplied or the extension is unsupported.
    """
    if file_path is None:
        return None, None

    base = os.path.basename(file_path)
    filename = os.path.splitext(base)[0]
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.txt':
        text = read_text_file_with_encoding(file_path)
        if text:
            # BUG FIX: log messages previously printed a literal
            # "(unknown)" placeholder instead of the file name.
            logger.info(f"Parsed TXT file: {base}, {len(text)} chars")
            return text, filename
        logger.error(f"Failed to decode TXT file: {base}")
        return None, filename

    if ext == '.epub':
        if not EPUB_SUPPORT:
            logger.error("EPUB support not available")
            return None, filename
        try:
            book = epub.read_epub(file_path)
            text_parts = []
            for item in book.get_items():
                # Only document items carry readable chapter content.
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    soup = BeautifulSoup(item.get_content(), 'html.parser')
                    text_parts.append(soup.get_text(separator='\n'))
            text = '\n\n'.join(text_parts)
            logger.info(f"Parsed EPUB file: {base}, {len(text)} chars")
            return text, filename
        except Exception as e:
            logger.error(f"Failed to parse EPUB: {e}")
            return None, filename

    # Unsupported extension.
    return None, None
+
|
| 161 |
+
async def convert_to_m4b(mp3_path, output_filename):
    """Convert an MP3 file to M4B (AAC audio in an MP4/"ipod" container).

    Returns the path of the converted file, or None on failure.
    `output_filename` is currently unused; the caller renames the result
    after conversion — kept for interface compatibility.
    """
    try:
        audio = AudioSegment.from_mp3(mp3_path)
        # BUG FIX: NamedTemporaryFile(delete=False) kept an open handle on
        # the very path pydub is about to export to — an fd leak, and a
        # hard failure on Windows. mkstemp + immediate close avoids both.
        fd, m4b_path = tempfile.mkstemp(suffix=".m4b")
        os.close(fd)
        # Export as m4a ("ipod" format); m4b is m4a with audiobook metadata.
        audio.export(m4b_path, format="ipod", codec="aac")
        logger.info(f"Converted to M4B: {m4b_path}")
        return m4b_path
    except Exception as e:
        logger.error(f"Failed to convert to M4B: {e}")
        return None
+
|
| 174 |
async def get_voices():
    """Fetch available Edge TTS voices as a {display label: short name} map."""
    voice_map = {}
    for voice in await edge_tts.list_voices():
        label = f"{voice['ShortName']} - {voice['Locale']} ({voice['Gender']})"
        voice_map[label] = voice['ShortName']
    return voice_map
|
|
|
|
| 340 |
logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
|
| 341 |
return merged_path
|
| 342 |
|
| 343 |
+
async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None, output_format="mp3", output_filename=None):
|
| 344 |
"""Generate speech with detailed progress tracking via generator"""
|
| 345 |
if not text.strip():
|
| 346 |
yield None, "Please enter text to convert.", None
|
|
|
|
| 378 |
yield 0, "Starting text processing...", None
|
| 379 |
logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
|
| 380 |
|
| 381 |
+
# Generate output filename with timestamp
|
| 382 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 383 |
+
if output_filename:
|
| 384 |
+
final_filename = f"{output_filename}_{timestamp}"
|
| 385 |
+
else:
|
| 386 |
+
final_filename = f"audio_{timestamp}"
|
| 387 |
+
|
| 388 |
+
final_audio_path = None
|
| 389 |
+
|
| 390 |
if estimated_duration > 15: # If longer than 15 minutes, split into segments
|
| 391 |
segments = split_text_by_paragraphs(text)
|
| 392 |
total_segments = len(segments)
|
|
|
|
| 423 |
|
| 424 |
# Merge all audio objects
|
| 425 |
merged_audio_path = await merge_audio_files(audio_objects)
|
| 426 |
+
final_audio_path = merged_audio_path
|
| 427 |
+
|
| 428 |
+
# Convert to M4B if requested
|
| 429 |
+
if output_format == "m4b" and merged_audio_path:
|
| 430 |
+
yield 95, "Converting to M4B format...", segment_info
|
| 431 |
+
m4b_path = await convert_to_m4b(merged_audio_path, final_filename)
|
| 432 |
+
if m4b_path:
|
| 433 |
+
os.remove(merged_audio_path)
|
| 434 |
+
final_audio_path = m4b_path
|
| 435 |
+
|
| 436 |
+
# Rename to final filename
|
| 437 |
+
if final_audio_path:
|
| 438 |
+
ext = ".m4b" if output_format == "m4b" else ".mp3"
|
| 439 |
+
new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
|
| 440 |
+
shutil.move(final_audio_path, new_path)
|
| 441 |
+
final_audio_path = new_path
|
| 442 |
|
| 443 |
yield 100, "Audio generation complete! ✅", segment_info
|
| 444 |
+
yield final_audio_path, "Done", segment_info
|
| 445 |
return
|
| 446 |
|
| 447 |
# For short texts or single segment, use original method
|
|
|
|
| 453 |
tmp_path = tmp_file.name
|
| 454 |
await communicate.save(tmp_path)
|
| 455 |
|
| 456 |
+
final_audio_path = tmp_path
|
| 457 |
+
|
| 458 |
+
# Convert to M4B if requested
|
| 459 |
+
if output_format == "m4b":
|
| 460 |
+
yield 80, "Converting to M4B format...", None
|
| 461 |
+
m4b_path = await convert_to_m4b(tmp_path, final_filename)
|
| 462 |
+
if m4b_path:
|
| 463 |
+
os.remove(tmp_path)
|
| 464 |
+
final_audio_path = m4b_path
|
| 465 |
+
|
| 466 |
+
# Rename to final filename
|
| 467 |
+
if final_audio_path:
|
| 468 |
+
ext = ".m4b" if output_format == "m4b" else ".mp3"
|
| 469 |
+
new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
|
| 470 |
+
shutil.move(final_audio_path, new_path)
|
| 471 |
+
final_audio_path = new_path
|
| 472 |
+
|
| 473 |
+
logger.info(f"Audio generated at {final_audio_path}")
|
| 474 |
yield 100, "Audio generation complete! ✅", None
|
| 475 |
+
yield final_audio_path, "Done", None
|
| 476 |
|
| 477 |
+
async def tts_interface(text, uploaded_file, voice, rate, volume, pitch, output_format,
|
| 478 |
enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 479 |
clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 480 |
del_special, wetext_norm):
|
| 481 |
"""Enhanced TTS interface with detailed progress tracking"""
|
| 482 |
+
|
| 483 |
+
# Get output filename from uploaded file (if any)
|
| 484 |
+
output_filename = None
|
| 485 |
+
if uploaded_file is not None:
|
| 486 |
+
output_filename = os.path.splitext(os.path.basename(uploaded_file))[0]
|
| 487 |
+
logger.info(f"Using filename from uploaded file: {output_filename}")
|
| 488 |
+
|
| 489 |
if not text.strip():
|
| 490 |
+
yield None, gr.update(visible=True, value="Please enter text or upload a file."), "No text provided", gr.update(visible=False)
|
| 491 |
return
|
| 492 |
if not voice:
|
| 493 |
yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
|
|
|
|
| 525 |
# Reset UI
|
| 526 |
yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
|
| 527 |
|
| 528 |
+
async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options, output_format, output_filename):
|
| 529 |
if isinstance(result, tuple) and len(result) == 3:
|
| 530 |
# Progress update
|
| 531 |
progress_val, status_msg, segment_info = result
|
|
|
|
| 574 |
with gr.Column():
|
| 575 |
text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
|
| 576 |
|
| 577 |
+
# File upload component
|
| 578 |
+
file_upload = gr.File(
|
| 579 |
+
label="Or Upload File (TXT/EPUB)",
|
| 580 |
+
file_types=[".txt", ".epub"],
|
| 581 |
+
type="filepath"
|
| 582 |
+
)
|
| 583 |
+
|
| 584 |
# Add text analysis info
|
| 585 |
+
text_info = gr.Markdown("**Text Analysis**: Enter text or upload a file to see estimated duration and segment count", visible=True)
|
| 586 |
|
| 587 |
with gr.Accordion("Text Cleaning Settings", open=True):
|
| 588 |
with gr.Row():
|
|
|
|
| 621 |
volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
|
| 622 |
pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
|
| 623 |
|
| 624 |
+
# Output format selection
|
| 625 |
+
output_format = gr.Radio(
|
| 626 |
+
choices=["mp3", "m4b"],
|
| 627 |
+
value="mp3",
|
| 628 |
+
label="Output Format",
|
| 629 |
+
info="MP3 is default. M4B is audiobook format (requires ffmpeg)."
|
| 630 |
+
)
|
| 631 |
+
|
| 632 |
generate_btn = gr.Button("Generate Audio", variant="primary")
|
| 633 |
|
| 634 |
with gr.Column():
|
|
|
|
| 650 |
gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
|
| 651 |
|
| 652 |
# Add text analysis function
|
| 653 |
+
def analyze_text(text, uploaded_file):
|
| 654 |
+
# If file is uploaded, parse it first
|
| 655 |
+
if uploaded_file is not None:
|
| 656 |
+
file_text, filename = parse_uploaded_file(uploaded_file)
|
| 657 |
+
if file_text:
|
| 658 |
+
text = file_text
|
| 659 |
+
else:
|
| 660 |
+
return f"**Text Analysis**: Failed to parse uploaded file"
|
| 661 |
+
|
| 662 |
+
if not text or not text.strip():
|
| 663 |
+
return "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
|
| 664 |
|
| 665 |
duration = estimate_text_duration(text)
|
| 666 |
word_count = len(text.split())
|
|
|
|
| 673 |
else:
|
| 674 |
return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
|
| 675 |
|
| 676 |
+
# Handle file upload - show preview in text box
|
| 677 |
+
def on_file_upload(uploaded_file):
    """Load an uploaded file into the text box and report its analysis.

    Returns (text_input update, text_info markdown string).
    """
    default_msg = "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
    if uploaded_file is None:
        return gr.update(), default_msg

    file_text, filename = parse_uploaded_file(uploaded_file)
    if not file_text:
        # Leave the text box untouched; surface the failure in the info line.
        return gr.update(), "**Text Analysis**: Failed to parse uploaded file"

    # Compute the same analysis shown for typed-in text.
    duration = estimate_text_duration(file_text)
    word_count = len(file_text.split())
    char_count = len(file_text)
    summary = f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
    if duration > 15:
        # Long texts get split; tell the user how many pieces to expect.
        segment_count = len(split_text_by_paragraphs(file_text))
        summary += f", {segment_count} segments will be generated"
    return gr.update(value=file_text), summary
+
|
| 699 |
# Update text analysis when text changes
|
| 700 |
text_input.change(
|
| 701 |
fn=analyze_text,
|
| 702 |
+
inputs=[text_input, file_upload],
|
| 703 |
outputs=[text_info]
|
| 704 |
)
|
| 705 |
+
|
| 706 |
+
# Update text box and analysis when file is uploaded
|
| 707 |
+
file_upload.change(
|
| 708 |
+
fn=on_file_upload,
|
| 709 |
+
inputs=[file_upload],
|
| 710 |
+
outputs=[text_input, text_info]
|
| 711 |
+
)
|
| 712 |
|
| 713 |
generate_btn.click(
|
| 714 |
fn=tts_interface,
|
| 715 |
inputs=[
|
| 716 |
+
text_input, file_upload, voice_dropdown, rate_slider, volume_slider, pitch_slider,
|
| 717 |
+
output_format, enable_cleaning, save_cleaned, clean_urls, clean_html,
|
| 718 |
clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
|
| 719 |
del_special, wetext_norm
|
| 720 |
],
|
requirements.txt
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
edge_tts==7.0.0
|
| 2 |
gradio>=5.0.0
|
| 3 |
pydub>=0.25.1
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
edge_tts==7.0.0
|
| 2 |
gradio>=5.0.0
|
| 3 |
pydub>=0.25.1
|
| 4 |
+
ebooklib>=0.18
|
| 5 |
+
beautifulsoup4>=4.12.0
|
| 6 |
+
chardet>=5.0.0
|