cs2764 committed on
Commit
3c74b9d
·
verified ·
1 Parent(s): 11cf4ef

Add file upload

Browse files
Files changed (2) hide show
  1. app.py +266 -14
  2. requirements.txt +3 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
4
  import tempfile
5
  import os
6
  import re
 
7
  from pydub import AudioSegment
8
  import math
9
  import time
@@ -11,6 +12,24 @@ from datetime import datetime, timedelta
11
  import logging
12
  from text_cleaning import TextCleaner
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Configure logging
15
  logging.basicConfig(
16
  level=logging.INFO,
@@ -21,6 +40,137 @@ logging.basicConfig(
21
  )
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  async def get_voices():
25
  voices = await edge_tts.list_voices()
26
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
@@ -190,7 +340,7 @@ async def merge_audio_files(audio_paths):
190
  logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
191
  return merged_path
192
 
193
- async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
194
  """Generate speech with detailed progress tracking via generator"""
195
  if not text.strip():
196
  yield None, "Please enter text to convert.", None
@@ -228,6 +378,15 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
228
  yield 0, "Starting text processing...", None
229
  logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
230
 
 
 
 
 
 
 
 
 
 
231
  if estimated_duration > 15: # If longer than 15 minutes, split into segments
232
  segments = split_text_by_paragraphs(text)
233
  total_segments = len(segments)
@@ -264,9 +423,25 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
264
 
265
  # Merge all audio objects
266
  merged_audio_path = await merge_audio_files(audio_objects)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  yield 100, "Audio generation complete! ✅", segment_info
269
- yield merged_audio_path, "Done", segment_info
270
  return
271
 
272
  # For short texts or single segment, use original method
@@ -278,17 +453,41 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_op
278
  tmp_path = tmp_file.name
279
  await communicate.save(tmp_path)
280
 
281
- logger.info(f"Audio generated at {tmp_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  yield 100, "Audio generation complete! ✅", None
283
- yield tmp_path, "Done", None
284
 
285
- async def tts_interface(text, voice, rate, volume, pitch,
286
  enable_cleaning, save_cleaned, clean_urls, clean_html,
287
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
288
  del_special, wetext_norm):
289
  """Enhanced TTS interface with detailed progress tracking"""
 
 
 
 
 
 
 
290
  if not text.strip():
291
- yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
292
  return
293
  if not voice:
294
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
@@ -326,7 +525,7 @@ async def tts_interface(text, voice, rate, volume, pitch,
326
  # Reset UI
327
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
328
 
329
- async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
330
  if isinstance(result, tuple) and len(result) == 3:
331
  # Progress update
332
  progress_val, status_msg, segment_info = result
@@ -375,8 +574,15 @@ async def create_demo():
375
  with gr.Column():
376
  text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
377
 
 
 
 
 
 
 
 
378
  # Add text analysis info
379
- text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
380
 
381
  with gr.Accordion("Text Cleaning Settings", open=True):
382
  with gr.Row():
@@ -415,6 +621,14 @@ async def create_demo():
415
  volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
416
  pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
417
 
 
 
 
 
 
 
 
 
418
  generate_btn = gr.Button("Generate Audio", variant="primary")
419
 
420
  with gr.Column():
@@ -436,9 +650,17 @@ async def create_demo():
436
  gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
437
 
438
  # Add text analysis function
439
- def analyze_text(text):
440
- if not text.strip():
441
- return "**Text Analysis**: Enter text to see estimated duration and segment count"
 
 
 
 
 
 
 
 
442
 
443
  duration = estimate_text_duration(text)
444
  word_count = len(text.split())
@@ -451,18 +673,48 @@ async def create_demo():
451
  else:
452
  return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  # Update text analysis when text changes
455
  text_input.change(
456
  fn=analyze_text,
457
- inputs=[text_input],
458
  outputs=[text_info]
459
  )
 
 
 
 
 
 
 
460
 
461
  generate_btn.click(
462
  fn=tts_interface,
463
  inputs=[
464
- text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
465
- enable_cleaning, save_cleaned, clean_urls, clean_html,
466
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
467
  del_special, wetext_norm
468
  ],
 
4
  import tempfile
5
  import os
6
  import re
7
+ import shutil
8
  from pydub import AudioSegment
9
  import math
10
  import time
 
12
  import logging
13
  from text_cleaning import TextCleaner
14
 
15
+ # EPUB parsing
16
+ try:
17
+ import ebooklib
18
+ from ebooklib import epub
19
+ from bs4 import BeautifulSoup
20
+ EPUB_SUPPORT = True
21
+ except ImportError:
22
+ EPUB_SUPPORT = False
23
+ logging.warning("ebooklib or beautifulsoup4 not installed. EPUB support disabled.")
24
+
25
+ # Encoding detection
26
+ try:
27
+ import chardet
28
+ CHARDET_SUPPORT = True
29
+ except ImportError:
30
+ CHARDET_SUPPORT = False
31
+ logging.warning("chardet not installed. Encoding detection will use fallback method.")
32
+
33
  # Configure logging
34
  logging.basicConfig(
35
  level=logging.INFO,
 
40
  )
41
  logger = logging.getLogger(__name__)
42
 
43
def detect_file_encoding(file_path):
    """Detect the text encoding of a file.

    Uses chardet when available and returns a normalized encoding name,
    or None when detection is unavailable or unsuccessful (callers then
    probe a list of common encodings themselves).
    """
    if not CHARDET_SUPPORT:
        # Fallback: let the caller try common encodings directly.
        return None

    # Sample a bounded prefix instead of loading the whole file: chardet
    # is accurate on a prefix and uploaded books can be very large.
    with open(file_path, 'rb') as f:
        raw_data = f.read(256 * 1024)

    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence'] or 0.0  # confidence is 0/None when undetected
    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

    if not encoding:
        return None

    # Map common aliases/subsets to the safest superset codec.
    encoding_map = {
        'gb2312': 'gbk',          # GBK is superset of GB2312
        'gb18030': 'gb18030',
        'ascii': 'utf-8',         # ASCII is subset of UTF-8
        'iso-8859-1': 'latin-1',
        'windows-1252': 'cp1252',
    }
    return encoding_map.get(encoding.lower(), encoding)
69
+
70
def read_text_file_with_encoding(file_path):
    """Read a text file with automatic encoding detection.

    Tries the chardet-detected encoding first, then a priority list of
    common encodings. Returns the decoded text, or None if every
    candidate fails.
    """
    detected_encoding = detect_file_encoding(file_path)

    # Candidate order matters: latin-1 and cp1252 accept (nearly) ANY
    # byte sequence, so they must come AFTER the strict multi-byte
    # codecs — otherwise shift_jis/euc-kr/etc. would never be tried.
    encodings_to_try = []
    if detected_encoding:
        encodings_to_try.append(detected_encoding)
    encodings_to_try.extend([
        'utf-8',
        'utf-8-sig',   # UTF-8 with BOM
        'gbk',         # Chinese (simplified)
        'gb18030',     # Chinese (extended)
        'big5',        # Chinese (traditional)
        'utf-16',
        'shift_jis',   # Japanese
        'euc-kr',      # Korean
        'cp1252',      # Windows Western (rejects a few bytes)
        'latin-1',     # Western European; never fails, so keep it last
    ])

    # Remove duplicates (case-insensitively) while preserving priority order.
    seen = set()
    unique_encodings = []
    for enc in encodings_to_try:
        if enc and enc.lower() not in seen:
            seen.add(enc.lower())
            unique_encodings.append(enc)

    last_error = None
    for encoding in unique_encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
            # Reject decodes that produced mostly replacement characters.
            if text.count('\ufffd') > len(text) * 0.1:
                logger.debug(f"Encoding {encoding} produced too many replacement characters, trying next...")
                continue
            logger.info(f"Successfully read file with encoding: {encoding}")
            return text
        except (UnicodeDecodeError, LookupError) as e:
            last_error = e
            logger.debug(f"Failed to decode with {encoding}: {e}")
            continue

    logger.error(f"Failed to decode file with any encoding. Last error: {last_error}")
    return None
123
+
124
def parse_uploaded_file(file_path):
    """Parse an uploaded .txt or .epub file.

    Returns (text, base_filename). text is None on parse failure; both
    are None when no file was provided or the extension is unsupported.
    """
    if file_path is None:
        return None, None

    filename = os.path.splitext(os.path.basename(file_path))[0]
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.txt':
        text = read_text_file_with_encoding(file_path)
        if text:
            # Log the actual file name (was a literal placeholder before).
            logger.info(f"Parsed TXT file: {filename}, {len(text)} chars")
            return text, filename
        logger.error(f"Failed to decode TXT file: {filename}")
        return None, filename

    if ext == '.epub':
        if not EPUB_SUPPORT:
            logger.error("EPUB support not available")
            return None, filename
        try:
            book = epub.read_epub(file_path)
            text_parts = []
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    # Strip the XHTML markup, keeping paragraph breaks.
                    soup = BeautifulSoup(item.get_content(), 'html.parser')
                    text_parts.append(soup.get_text(separator='\n'))
            text = '\n\n'.join(text_parts)
            logger.info(f"Parsed EPUB file: {filename}, {len(text)} chars")
            return text, filename
        except Exception as e:
            logger.error(f"Failed to parse EPUB: {e}")
            return None, filename

    # Unsupported extension.
    return None, None
160
+
161
async def convert_to_m4b(mp3_path, output_filename):
    """Convert an MP3 file to M4B (AAC audio in an MP4 container).

    output_filename is accepted for interface compatibility but unused;
    the caller renames the result itself. Returns the path of the new
    file, or None on failure (e.g. ffmpeg/AAC codec missing).
    """
    try:
        audio = AudioSegment.from_mp3(mp3_path)
        # mkstemp + close instead of NamedTemporaryFile(...).name: the
        # open descriptor would block ffmpeg from reopening the path on
        # Windows and would otherwise leak until GC.
        fd, m4b_path = tempfile.mkstemp(suffix=".m4b")
        os.close(fd)
        # "ipod" container with AAC: m4b is essentially m4a with audiobook metadata.
        audio.export(m4b_path, format="ipod", codec="aac")
        logger.info(f"Converted to M4B: {m4b_path}")
        return m4b_path
    except Exception as e:
        logger.error(f"Failed to convert to M4B: {e}")
        return None
173
+
174
async def get_voices():
    """Return a mapping of display labels ("ShortName - Locale (Gender)") to Edge TTS short names."""
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 
340
  logger.info(f"Merged audio saved to {merged_path} (Total size: {total_size / 1024 / 1024:.2f} MB)")
341
  return merged_path
342
 
343
+ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None, output_format="mp3", output_filename=None):
344
  """Generate speech with detailed progress tracking via generator"""
345
  if not text.strip():
346
  yield None, "Please enter text to convert.", None
 
378
  yield 0, "Starting text processing...", None
379
  logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
380
 
381
+ # Generate output filename with timestamp
382
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
383
+ if output_filename:
384
+ final_filename = f"{output_filename}_{timestamp}"
385
+ else:
386
+ final_filename = f"audio_{timestamp}"
387
+
388
+ final_audio_path = None
389
+
390
  if estimated_duration > 15: # If longer than 15 minutes, split into segments
391
  segments = split_text_by_paragraphs(text)
392
  total_segments = len(segments)
 
423
 
424
  # Merge all audio objects
425
  merged_audio_path = await merge_audio_files(audio_objects)
426
+ final_audio_path = merged_audio_path
427
+
428
+ # Convert to M4B if requested
429
+ if output_format == "m4b" and merged_audio_path:
430
+ yield 95, "Converting to M4B format...", segment_info
431
+ m4b_path = await convert_to_m4b(merged_audio_path, final_filename)
432
+ if m4b_path:
433
+ os.remove(merged_audio_path)
434
+ final_audio_path = m4b_path
435
+
436
+ # Rename to final filename
437
+ if final_audio_path:
438
+ ext = ".m4b" if output_format == "m4b" else ".mp3"
439
+ new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
440
+ shutil.move(final_audio_path, new_path)
441
+ final_audio_path = new_path
442
 
443
  yield 100, "Audio generation complete! ✅", segment_info
444
+ yield final_audio_path, "Done", segment_info
445
  return
446
 
447
  # For short texts or single segment, use original method
 
453
  tmp_path = tmp_file.name
454
  await communicate.save(tmp_path)
455
 
456
+ final_audio_path = tmp_path
457
+
458
+ # Convert to M4B if requested
459
+ if output_format == "m4b":
460
+ yield 80, "Converting to M4B format...", None
461
+ m4b_path = await convert_to_m4b(tmp_path, final_filename)
462
+ if m4b_path:
463
+ os.remove(tmp_path)
464
+ final_audio_path = m4b_path
465
+
466
+ # Rename to final filename
467
+ if final_audio_path:
468
+ ext = ".m4b" if output_format == "m4b" else ".mp3"
469
+ new_path = os.path.join(os.path.dirname(final_audio_path), f"{final_filename}{ext}")
470
+ shutil.move(final_audio_path, new_path)
471
+ final_audio_path = new_path
472
+
473
+ logger.info(f"Audio generated at {final_audio_path}")
474
  yield 100, "Audio generation complete! ✅", None
475
+ yield final_audio_path, "Done", None
476
 
477
+ async def tts_interface(text, uploaded_file, voice, rate, volume, pitch, output_format,
478
  enable_cleaning, save_cleaned, clean_urls, clean_html,
479
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
480
  del_special, wetext_norm):
481
  """Enhanced TTS interface with detailed progress tracking"""
482
+
483
+ # Get output filename from uploaded file (if any)
484
+ output_filename = None
485
+ if uploaded_file is not None:
486
+ output_filename = os.path.splitext(os.path.basename(uploaded_file))[0]
487
+ logger.info(f"Using filename from uploaded file: {output_filename}")
488
+
489
  if not text.strip():
490
+ yield None, gr.update(visible=True, value="Please enter text or upload a file."), "No text provided", gr.update(visible=False)
491
  return
492
  if not voice:
493
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
 
525
  # Reset UI
526
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
527
 
528
+ async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options, output_format, output_filename):
529
  if isinstance(result, tuple) and len(result) == 3:
530
  # Progress update
531
  progress_val, status_msg, segment_info = result
 
574
  with gr.Column():
575
  text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
576
 
577
+ # File upload component
578
+ file_upload = gr.File(
579
+ label="Or Upload File (TXT/EPUB)",
580
+ file_types=[".txt", ".epub"],
581
+ type="filepath"
582
+ )
583
+
584
  # Add text analysis info
585
+ text_info = gr.Markdown("**Text Analysis**: Enter text or upload a file to see estimated duration and segment count", visible=True)
586
 
587
  with gr.Accordion("Text Cleaning Settings", open=True):
588
  with gr.Row():
 
621
  volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
622
  pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
623
 
624
+ # Output format selection
625
+ output_format = gr.Radio(
626
+ choices=["mp3", "m4b"],
627
+ value="mp3",
628
+ label="Output Format",
629
+ info="MP3 is default. M4B is audiobook format (requires ffmpeg)."
630
+ )
631
+
632
  generate_btn = gr.Button("Generate Audio", variant="primary")
633
 
634
  with gr.Column():
 
650
  gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
651
 
652
  # Add text analysis function
653
+ def analyze_text(text, uploaded_file):
654
+ # If file is uploaded, parse it first
655
+ if uploaded_file is not None:
656
+ file_text, filename = parse_uploaded_file(uploaded_file)
657
+ if file_text:
658
+ text = file_text
659
+ else:
660
+ return f"**Text Analysis**: Failed to parse uploaded file"
661
+
662
+ if not text or not text.strip():
663
+ return "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
664
 
665
  duration = estimate_text_duration(text)
666
  word_count = len(text.split())
 
673
  else:
674
  return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
675
 
676
# Handle file upload - show preview in text box
def on_file_upload(uploaded_file):
    """Load an uploaded file into the text box and refresh the analysis line."""
    idle_msg = "**Text Analysis**: Enter text or upload a file to see estimated duration and segment count"
    if uploaded_file is None:
        return gr.update(), idle_msg

    file_text, filename = parse_uploaded_file(uploaded_file)
    if not file_text:
        return gr.update(), "**Text Analysis**: Failed to parse uploaded file"

    # Calculate analysis
    duration = estimate_text_duration(file_text)
    word_count = len(file_text.split())
    char_count = len(file_text)

    analysis = f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
    if duration > 15:
        segment_count = len(split_text_by_paragraphs(file_text))
        analysis += f", {segment_count} segments will be generated"

    return gr.update(value=file_text), analysis
698
+
699
  # Update text analysis when text changes
700
  text_input.change(
701
  fn=analyze_text,
702
+ inputs=[text_input, file_upload],
703
  outputs=[text_info]
704
  )
705
+
706
+ # Update text box and analysis when file is uploaded
707
+ file_upload.change(
708
+ fn=on_file_upload,
709
+ inputs=[file_upload],
710
+ outputs=[text_input, text_info]
711
+ )
712
 
713
  generate_btn.click(
714
  fn=tts_interface,
715
  inputs=[
716
+ text_input, file_upload, voice_dropdown, rate_slider, volume_slider, pitch_slider,
717
+ output_format, enable_cleaning, save_cleaned, clean_urls, clean_html,
718
  clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
719
  del_special, wetext_norm
720
  ],
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  edge_tts==7.0.0
2
  gradio>=5.0.0
3
  pydub>=0.25.1
 
 
 
 
1
  edge_tts==7.0.0
2
  gradio>=5.0.0
3
  pydub>=0.25.1
4
+ ebooklib>=0.18
5
+ beautifulsoup4>=4.12.0
6
+ chardet>=5.0.0