cs2764 committed on
Commit
f9ed71b
·
verified ·
1 Parent(s): d098a55

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +480 -476
  2. text_cleaning.py +44 -0
app.py CHANGED
@@ -1,476 +1,480 @@
1
- import gradio as gr
2
- import edge_tts
3
- import asyncio
4
- import tempfile
5
- import os
6
- import re
7
- from pydub import AudioSegment
8
- import math
9
- import time
10
- from datetime import datetime, timedelta
11
- import logging
12
- from text_cleaning import TextCleaner
13
-
14
- # Configure logging
15
- logging.basicConfig(
16
- level=logging.INFO,
17
- format='%(asctime)s - %(levelname)s - %(message)s',
18
- handlers=[
19
- logging.StreamHandler()
20
- ]
21
- )
22
- logger = logging.getLogger(__name__)
23
-
24
- async def get_voices():
25
- voices = await edge_tts.list_voices()
26
- return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
27
-
28
- def format_time_remaining(seconds):
29
- """Format seconds into human readable time remaining"""
30
- if seconds < 60:
31
- return f"{int(seconds)}s"
32
- elif seconds < 3600:
33
- minutes = seconds / 60
34
- return f"{minutes:.1f}m"
35
- else:
36
- hours = seconds / 3600
37
- return f"{hours:.1f}h"
38
-
39
- def calculate_eta(start_time, completed_items, total_items):
40
- """Calculate estimated time remaining"""
41
- if completed_items == 0:
42
- return "Calculating..."
43
-
44
- elapsed_time = time.time() - start_time
45
- time_per_item = elapsed_time / completed_items
46
- remaining_items = total_items - completed_items
47
- remaining_time = time_per_item * remaining_items
48
-
49
- return format_time_remaining(remaining_time)
50
-
51
- def estimate_text_duration(text):
52
- """Estimate speech duration in minutes based on text length"""
53
- # Simple heuristic:
54
- # For English (space-separated), ~150 words/min
55
- # For Chinese (no spaces), ~300 chars/min
56
- # We'll use a hybrid approach: count spaces to guess if it's space-separated.
57
-
58
- if not text:
59
- return 0
60
-
61
- space_count = text.count(' ')
62
- total_len = len(text)
63
-
64
- # If spaces are < 10% of length, assume non-space-separated (like Chinese)
65
- if space_count / total_len < 0.1:
66
- # Approx 300 chars per minute for Chinese
67
- duration = total_len / 300
68
- # logger.debug(f"Estimated duration (char-based): {duration:.2f} min ({total_len} chars)")
69
- else:
70
- # Approx 150 words per minute for English
71
- word_count = len(text.split())
72
- duration = word_count / 150
73
- # logger.debug(f"Estimated duration (word-based): {duration:.2f} min ({word_count} words)")
74
-
75
- return duration
76
-
77
- def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
78
- """Split text into segments that won't exceed limit with safety margin"""
79
- max_duration = max_duration_minutes
80
- estimated_duration = estimate_text_duration(text)
81
-
82
- logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")
83
-
84
- if estimated_duration <= max_duration and len(text) <= max_chars:
85
- return [text]
86
-
87
- logger.info(f"Text exceeds limits. Splitting...")
88
-
89
- # Split by paragraphs first
90
- paragraphs = text.split('\n\n')
91
- segments = []
92
- current_segment = ""
93
-
94
- for paragraph in paragraphs:
95
- paragraph_duration = estimate_text_duration(paragraph)
96
-
97
- # If single paragraph is too long, split by sentences
98
- # Improved regex to include Chinese punctuation
99
- if paragraph_duration > max_duration or len(paragraph) > max_chars:
100
- sentences = re.split(r'([.!?。!?]+)', paragraph)
101
- # Re-attach delimiters to sentences
102
- real_sentences = []
103
- for i in range(0, len(sentences) - 1, 2):
104
- real_sentences.append(sentences[i] + sentences[i+1])
105
- if len(sentences) % 2 == 1 and sentences[-1]:
106
- real_sentences.append(sentences[-1])
107
-
108
- for sentence in real_sentences:
109
- sentence = sentence.strip()
110
- if not sentence:
111
- continue
112
-
113
- # Check both duration and char count
114
- if (estimate_text_duration(current_segment + sentence) > max_duration or
115
- len(current_segment + sentence) > max_chars) and current_segment:
116
- segments.append(current_segment.strip())
117
- current_segment = sentence
118
- else:
119
- current_segment += sentence
120
- else:
121
- if (estimate_text_duration(current_segment + paragraph) > max_duration or
122
- len(current_segment + paragraph) > max_chars) and current_segment:
123
- segments.append(current_segment.strip())
124
- current_segment = paragraph + "\n\n"
125
- else:
126
- current_segment += paragraph + "\n\n"
127
-
128
- if current_segment.strip():
129
- segments.append(current_segment.strip())
130
-
131
- logger.info(f"Split text into {len(segments)} segments.")
132
- return segments
133
-
134
- import io
135
-
136
- async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
137
- """Generate audio for a single text segment and return as BytesIO"""
138
- logger.info(f"Generating segment {segment_index}...")
139
- communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
140
-
141
- audio_data = io.BytesIO()
142
- try:
143
- async for chunk in communicate.stream():
144
- if chunk["type"] == "audio":
145
- audio_data.write(chunk["data"])
146
- except Exception as e:
147
- logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
148
- raise gr.Error(f"Error generating segment {segment_index}: {e}")
149
-
150
- audio_data.seek(0)
151
-
152
- # Verify segment duration
153
- try:
154
- # Make a copy for verification so we don't consume the main buffer
155
- verify_buffer = io.BytesIO(audio_data.getvalue())
156
- seg_audio = AudioSegment.from_mp3(verify_buffer)
157
- duration_min = len(seg_audio) / 1000 / 60
158
- logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
159
- except Exception as e:
160
- logger.error(f"Error checking segment {segment_index} duration: {e}")
161
-
162
- audio_data.seek(0)
163
- return audio_data
164
-
165
- async def merge_audio_files(audio_objects):
166
- """Merge multiple audio BytesIO objects into one file"""
167
- if not audio_objects:
168
- return None
169
-
170
- logger.info(f"Merging {len(audio_objects)} audio segments...")
171
-
172
- # Load and merge audio segments
173
- combined = AudioSegment.empty()
174
- for i, audio_obj in enumerate(audio_objects):
175
- try:
176
- audio_obj.seek(0)
177
- segment = AudioSegment.from_mp3(audio_obj)
178
- combined += segment
179
- # Explicitly close/clear the BytesIO object to free memory
180
- audio_obj.close()
181
- except Exception as e:
182
- logger.error(f"Error merging segment {i+1}: {e}")
183
-
184
- # Save merged audio to a single temporary file
185
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
186
- merged_path = tmp_file.name
187
- combined.export(merged_path, format="mp3")
188
-
189
- total_duration_min = len(combined) / 1000 / 60
190
- logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
191
- return merged_path
192
-
193
- async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
194
- """Generate speech with detailed progress tracking via generator"""
195
- if not text.strip():
196
- yield None, "Please enter text to convert.", None
197
- return
198
- if not voice:
199
- yield None, "Please select a voice.", None
200
- return
201
-
202
- # Apply text cleaning if enabled
203
- if cleaning_options and cleaning_options.get('enable_cleaning', False):
204
- yield 0, "Cleaning text...", None
205
- # original_text = text # Unused
206
- text = TextCleaner.clean_text(text, cleaning_options)
207
-
208
- if cleaning_options.get('save_cleaned', False):
209
- # Create a filename based on timestamp or first few words
210
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
211
- filename = f"text_{timestamp}.txt"
212
- saved_path = TextCleaner.save_cleaned_text(text, filename)
213
- if saved_path:
214
- logger.info(f"Saved cleaned text to {saved_path}")
215
-
216
- if not text.strip():
217
- yield None, "Text cleaning resulted in empty text.", None
218
- return
219
-
220
- voice_short_name = voice.split(" - ")[0]
221
- rate_str = f"{rate:+d}%"
222
- volume_str = f"{volume:+d}%"
223
- pitch_str = f"{pitch:+d}Hz"
224
-
225
- # Check if text is too long and needs segmentation
226
- estimated_duration = estimate_text_duration(text)
227
-
228
- yield 0, "Starting text processing...", None
229
- logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")
230
-
231
- if estimated_duration > 15: # If longer than 15 minutes, split into segments
232
- segments = split_text_by_paragraphs(text)
233
- total_segments = len(segments)
234
-
235
- segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
236
- yield 5, segment_info, segment_info
237
-
238
- if total_segments > 1:
239
- # Generate audio for each segment with progress tracking
240
- audio_objects = []
241
- start_time = time.time()
242
-
243
- for i, segment in enumerate(segments):
244
- if segment.strip():
245
- segment_duration = estimate_text_duration(segment)
246
-
247
- progress = 10 + (80 * i / total_segments) # 10% to 90%
248
- eta = calculate_eta(start_time, i, total_segments)
249
- status_msg = (
250
- f"Generating segment {i+1}/{total_segments}...\n"
251
- f"Segment duration: {segment_duration:.1f} min\n"
252
- f"ETA: {eta}"
253
- )
254
- logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
255
- yield progress, status_msg, segment_info
256
-
257
- # Generate to memory
258
- audio_obj = await generate_audio_segment(
259
- segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
260
- )
261
- audio_objects.append(audio_obj)
262
-
263
- yield 90, "Merging audio files...", segment_info
264
-
265
- # Merge all audio objects
266
- merged_audio_path = await merge_audio_files(audio_objects)
267
-
268
- yield 100, "Audio generation complete! ✅", segment_info
269
- yield merged_audio_path, "Done", segment_info
270
- return
271
-
272
- # For short texts or single segment, use original method
273
- yield 50, "Generating audio...", None
274
-
275
- logger.info("Generating single segment audio...")
276
- communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
277
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
278
- tmp_path = tmp_file.name
279
- await communicate.save(tmp_path)
280
-
281
- logger.info(f"Audio generated at {tmp_path}")
282
- yield 100, "Audio generation complete! ✅", None
283
- yield tmp_path, "Done", None
284
-
285
- async def tts_interface(text, voice, rate, volume, pitch,
286
- enable_cleaning, save_cleaned, clean_urls, clean_html,
287
- clean_ads, fix_enc, tidy_ws, del_gutenberg,
288
- del_special, wetext_norm):
289
- """Enhanced TTS interface with detailed progress tracking"""
290
- if not text.strip():
291
- yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
292
- return
293
- if not voice:
294
- yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
295
- return
296
-
297
- # Prepare cleaning options
298
- cleaning_options = {
299
- 'enable_cleaning': enable_cleaning,
300
- 'save_cleaned': save_cleaned,
301
- 'remove_urls': clean_urls,
302
- 'remove_html': clean_html,
303
- 'filter_ads': clean_ads,
304
- 'fix_encoding': fix_enc,
305
- 'tidy_whitespace': tidy_ws,
306
- 'remove_gutenberg': del_gutenberg,
307
- 'remove_special_chars': del_special,
308
- 'wetext_normalization': wetext_norm
309
- }
310
-
311
- # We need to clean text here first to estimate duration correctly?
312
- # Or let the generator handle it. The generator handles it, but estimation might be off.
313
- # Ideally we clean first if enabled, then estimate.
314
-
315
- working_text = text
316
- if enable_cleaning:
317
- working_text = TextCleaner.clean_text(text, cleaning_options)
318
- if save_cleaned:
319
- # We'll let the generator save it to avoid double saving or complex logic here,
320
- # but we need to pass the options.
321
- pass
322
-
323
- estimated_duration = estimate_text_duration(working_text)
324
-
325
- # Reset UI
326
- yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
327
-
328
- async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
329
- if isinstance(result, tuple) and len(result) == 3:
330
- # Progress update
331
- progress_val, status_msg, segment_info = result
332
-
333
- if isinstance(progress_val, (int, float)):
334
- # It's a progress update
335
- segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
336
- yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
337
- else:
338
- # It's the final result (path, msg, info)
339
- audio_path = progress_val
340
- yield audio_path, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
341
-
342
- async def create_demo():
343
- voices = await get_voices()
344
-
345
- description = """
346
- Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
347
-
348
- 🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
349
-
350
- Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
351
- Transform your words into stunning, professional-quality videos in just a few clicks.
352
-
353
- ✨ Features:
354
- Convert text to engaging videos with customizable visuals
355
- Choose from 40+ languages and 300+ voices
356
- Perfect for creating audiobooks, storytelling, and language learning materials
357
- Ideal for educators, content creators, and language enthusiasts
358
-
359
- 📝 **Long Text Support**:
360
- Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
361
- """
362
-
363
- default_voice = ""
364
- for voice_key in voices.keys():
365
- if "XiaoxiaoNeural" in voice_key:
366
- default_voice = voice_key
367
- break
368
-
369
- with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
370
- gr.Markdown("# Edge TTS Text-to-Speech")
371
- gr.Markdown(description)
372
-
373
- with gr.Row():
374
- with gr.Column():
375
- text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
376
-
377
- # Add text analysis info
378
- text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
379
-
380
- with gr.Accordion("Text Cleaning Settings", open=True):
381
- with gr.Row():
382
- enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
383
- save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)
384
-
385
- with gr.Group(visible=True) as cleaning_options_group:
386
- with gr.Row():
387
- clean_urls = gr.Checkbox(label="Remove URLs", value=True)
388
- clean_html = gr.Checkbox(label="Remove HTML", value=True)
389
-
390
- with gr.Row():
391
- clean_ads = gr.Checkbox(label="Filter Ads", value=True)
392
- fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
393
-
394
- with gr.Row():
395
- tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)
396
- del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
397
-
398
- with gr.Row():
399
- del_special = gr.Checkbox(label="Remove Special Characters", value=True)
400
- wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)
401
-
402
- def toggle_options(enabled):
403
- return gr.update(visible=enabled)
404
-
405
- enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])
406
-
407
- voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
408
-
409
- with gr.Row():
410
- rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
411
- volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
412
- pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
413
-
414
- generate_btn = gr.Button("Generate Audio", variant="primary")
415
-
416
- with gr.Column():
417
- audio_output = gr.Audio(label="Generated Audio", type="filepath")
418
-
419
- # Progress and status display
420
- with gr.Group():
421
- gr.Markdown("### 📊 Processing Progress")
422
- progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)
423
-
424
- # Processing details
425
- with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
426
- status_output = gr.Markdown("Waiting...", visible=True)
427
-
428
- # Segment information display
429
- with gr.Accordion("📋 Segment Information", open=True) as segment_info:
430
- segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)
431
-
432
- gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
433
-
434
- # Add text analysis function
435
- def analyze_text(text):
436
- if not text.strip():
437
- return "**Text Analysis**: Enter text to see estimated duration and segment count"
438
-
439
- duration = estimate_text_duration(text)
440
- word_count = len(text.split())
441
- char_count = len(text)
442
-
443
- if duration > 15:
444
- segments = split_text_by_paragraphs(text)
445
- segment_count = len(segments)
446
- return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
447
- else:
448
- return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
449
-
450
- # Update text analysis when text changes
451
- text_input.change(
452
- fn=analyze_text,
453
- inputs=[text_input],
454
- outputs=[text_info]
455
- )
456
-
457
- generate_btn.click(
458
- fn=tts_interface,
459
- inputs=[
460
- text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
461
- enable_cleaning, save_cleaned, clean_urls, clean_html,
462
- clean_ads, fix_enc, tidy_ws, del_gutenberg,
463
- del_special, wetext_norm
464
- ],
465
- outputs=[audio_output, progress_info, status_output, segment_details]
466
- )
467
-
468
- return demo
469
-
470
- async def main():
471
- demo = await create_demo()
472
- demo.queue(default_concurrency_limit=5)
473
- demo.launch(show_api=False)
474
-
475
- if __name__ == "__main__":
476
- asyncio.run(main())
 
 
 
 
 
1
+ import gradio as gr
2
+ import edge_tts
3
+ import asyncio
4
+ import tempfile
5
+ import os
6
+ import re
7
+ from pydub import AudioSegment
8
+ import math
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ import logging
12
+ from text_cleaning import TextCleaner
13
+
14
# Logging setup: INFO and above, one stream handler, every record prefixed
# with its timestamp and severity.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
23
+
24
async def get_voices():
    """Return the available Edge TTS voices as a label -> short-name mapping.

    Each label has the form "<ShortName> - <Locale> (<Gender>)"; the value
    is the voice's ShortName.
    """
    labelled = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
27
+
28
def format_time_remaining(seconds):
    """Render a duration in seconds as a compact label.

    Below one minute the value is truncated to whole seconds ("42s");
    below one hour it is shown in minutes with one decimal ("2.5m");
    anything longer is shown in hours with one decimal ("1.5h").
    """
    if seconds < 60:
        return f"{int(seconds)}s"
    if seconds < 3600:
        return f"{seconds / 60:.1f}m"
    return f"{seconds / 3600:.1f}h"
38
+
39
def calculate_eta(start_time, completed_items, total_items):
    """Estimate the remaining time from the average time per finished item.

    Returns "Calculating..." while nothing has completed yet; otherwise the
    projected remaining time formatted by format_time_remaining().
    start_time is a time.time() timestamp taken when the work began.
    """
    if completed_items == 0:
        return "Calculating..."

    per_item = (time.time() - start_time) / completed_items
    return format_time_remaining(per_item * (total_items - completed_items))
50
+
51
def estimate_text_duration(text):
    """Estimate the spoken duration of *text* in minutes.

    Heuristic: if fewer than 10% of the characters are spaces, the text is
    treated as non-space-separated (e.g. Chinese) and timed at ~300
    characters per minute; otherwise it is treated as space-separated
    (e.g. English) and timed at ~150 words per minute.  Empty input gives 0.
    """
    if not text:
        return 0

    length = len(text)
    space_ratio = text.count(' ') / length

    if space_ratio < 0.1:
        # Character-based estimate.
        return length / 300

    # Word-based estimate.
    return len(text.split()) / 150
76
+
77
def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=500):
    """Split *text* into segments that stay within the duration/size limits.

    Text already within both limits is returned unchanged as a one-element
    list.  Otherwise the text is split on blank lines; paragraphs are packed
    greedily into segments, and a paragraph that is itself over a limit is
    broken at sentence-ending punctuation (ASCII and CJK) first.

    Args:
        text: The text to segment.
        max_duration_minutes: Maximum estimated speech duration per segment.
        max_chars: Maximum number of characters per segment.

    Returns:
        A list of non-empty, stripped segment strings.  A single sentence
        that exceeds a limit on its own is kept whole, so a segment can
        still be over the limit in that case.
    """
    max_duration = max_duration_minutes
    estimated_duration = estimate_text_duration(text)

    logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")

    if estimated_duration <= max_duration and len(text) <= max_chars:
        return [text]

    logger.info("Text exceeds limits. Splitting...")

    def too_big(candidate):
        return (estimate_text_duration(candidate) > max_duration
                or len(candidate) > max_chars)

    segments = []
    current_segment = ""

    for paragraph in text.split('\n\n'):
        # Blank paragraphs carry no speech; skipping them also prevents
        # whitespace-only "segments" from being emitted.
        if not paragraph.strip():
            continue

        if too_big(paragraph):
            # The capture group keeps the delimiters, so the result
            # alternates text, punctuation, text, ... and has odd length.
            parts = re.split(r'([.!?。！？]+)', paragraph)
            sentences = [parts[i] + parts[i + 1]
                         for i in range(0, len(parts) - 1, 2)]
            if parts[-1]:
                sentences.append(parts[-1])

            for sentence in sentences:
                if not sentence.strip():
                    continue

                # Sentences keep their original leading whitespace so that
                # re-joining them reproduces the source text ("one. Two"
                # rather than "one.Two", and "3.14" stays intact).
                if current_segment.strip() and too_big(current_segment + sentence):
                    segments.append(current_segment.strip())
                    current_segment = sentence.lstrip()
                else:
                    current_segment += sentence

            # Keep a paragraph break between this paragraph and the next.
            if current_segment.strip():
                current_segment += "\n\n"
        else:
            if current_segment.strip() and too_big(current_segment + paragraph):
                segments.append(current_segment.strip())
                current_segment = ""
            current_segment += paragraph + "\n\n"

    if current_segment.strip():
        segments.append(current_segment.strip())

    logger.info(f"Split text into {len(segments)} segments.")
    return segments
133
+
134
+ import io
135
+
136
async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
    """Synthesize one text segment with Edge TTS and return it as a BytesIO.

    The audio chunks from the stream are collected in memory.  Any failure
    while streaming is logged and re-raised as gr.Error.  Afterwards a copy
    of the data is decoded purely to log the segment's duration; a failure
    of that check is logged and otherwise ignored.  The returned buffer is
    positioned at offset 0.
    """
    logger.info(f"Generating segment {segment_index}...")
    tts = edge_tts.Communicate(
        text_segment,
        voice_short_name,
        rate=rate_str,
        volume=volume_str,
        pitch=pitch_str,
    )

    buffer = io.BytesIO()
    try:
        async for event in tts.stream():
            if event["type"] != "audio":
                continue
            buffer.write(event["data"])
    except Exception as e:
        logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
        raise gr.Error(f"Error generating segment {segment_index}: {e}")

    # Duration check on a copy, so the buffer handed back is never consumed.
    try:
        decoded = AudioSegment.from_mp3(io.BytesIO(buffer.getvalue()))
        duration_min = len(decoded) / 1000 / 60
        logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
    except Exception as e:
        logger.error(f"Error checking segment {segment_index} duration: {e}")

    buffer.seek(0)
    return buffer
164
+
165
async def merge_audio_files(audio_objects):
    """Concatenate in-memory MP3 buffers and write them to one temp file.

    Args:
        audio_objects: Sequence of BytesIO objects holding MP3 data, in
            playback order.  Every buffer is closed once it has been
            processed, whether or not it could be decoded.

    Returns:
        Path of the merged ".mp3" temporary file (not deleted
        automatically), or None when there is nothing to merge or no
        segment could be decoded.

    Merging is best-effort: a segment that fails to decode is logged and
    skipped so the remaining segments still produce a result.
    """
    if not audio_objects:
        return None

    logger.info(f"Merging {len(audio_objects)} audio segments...")

    combined = AudioSegment.empty()
    merged_count = 0
    for i, audio_obj in enumerate(audio_objects):
        try:
            audio_obj.seek(0)
            combined += AudioSegment.from_mp3(audio_obj)
            merged_count += 1
        except Exception as e:
            logger.error(f"Error merging segment {i+1}: {e}")
        finally:
            # Release the buffer's memory even when decoding failed.
            audio_obj.close()

    if merged_count == 0:
        # Exporting an empty AudioSegment would hand the caller a file
        # with no audio in it; report the failure instead.
        logger.error("No audio segments could be merged.")
        return None

    # Reserve a temp file name, then export after the handle is closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        merged_path = tmp_file.name
    combined.export(merged_path, format="mp3")

    total_duration_min = len(combined) / 1000 / 60
    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
    return merged_path
192
+
193
async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
    """Convert text to speech, yielding progress tuples along the way.

    Every yielded item is a 3-tuple ``(first, message, segment_info)``:

    * progress update: ``first`` is a number (0-100);
    * abort: ``first`` is None and ``message`` explains why;
    * final result: ``first`` is the path of the generated audio file and
      ``message`` is "Done".

    Args:
        text: Input text.
        voice: Voice label whose part before " - " is the Edge TTS short name.
        rate: Speech-rate offset in percent.
        volume: Volume offset in percent.
        pitch: Pitch offset in Hz.
        cleaning_options: Optional dict; when its 'enable_cleaning' entry is
            truthy the text is passed through TextCleaner.clean_text, and
            when 'save_cleaned' is truthy the cleaned text is also saved.

    Text estimated at more than 15 minutes is segmented, synthesized
    segment by segment and merged; anything else is synthesized in one go.
    """
    if not text.strip():
        yield None, "Please enter text to convert.", None
        return
    if not voice:
        yield None, "Please select a voice.", None
        return

    # Apply text cleaning if enabled
    if cleaning_options and cleaning_options.get('enable_cleaning', False):
        yield 0, "Cleaning text...", None
        text = TextCleaner.clean_text(text, cleaning_options)

        if cleaning_options.get('save_cleaned', False):
            # File name is derived from the current timestamp.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"text_{timestamp}.txt"
            saved_path = TextCleaner.save_cleaned_text(text, filename)
            if saved_path:
                logger.info(f"Saved cleaned text to {saved_path}")

    if not text.strip():
        yield None, "Text cleaning resulted in empty text.", None
        return

    voice_short_name = voice.split(" - ")[0]
    # The "+d" format only accepts integers; coerce first so a float
    # slider value (e.g. 5.0) does not raise ValueError.
    rate_str = f"{int(round(rate)):+d}%"
    volume_str = f"{int(round(volume)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"

    # Check if text is too long and needs segmentation
    estimated_duration = estimate_text_duration(text)

    yield 0, "Starting text processing...", None
    logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")

    if estimated_duration > 15:  # If longer than 15 minutes, split into segments
        segments = split_text_by_paragraphs(text)
        total_segments = len(segments)

        segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
        yield 5, segment_info, segment_info

        if total_segments > 1:
            # Generate audio for each segment with progress tracking
            audio_objects = []
            start_time = time.time()

            for i, segment in enumerate(segments):
                if not segment.strip():
                    continue

                segment_duration = estimate_text_duration(segment)

                progress = 10 + (80 * i / total_segments)  # 10% to 90%
                eta = calculate_eta(start_time, i, total_segments)
                status_msg = (
                    f"Generating segment {i+1}/{total_segments}...\n"
                    f"Segment duration: {segment_duration:.1f} min\n"
                    f"ETA: {eta}"
                )
                logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
                yield progress, status_msg, segment_info

                # Generate to memory
                audio_obj = await generate_audio_segment(
                    segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
                )
                audio_objects.append(audio_obj)

            yield 90, "Merging audio files...", segment_info

            # Merge all audio objects
            merged_audio_path = await merge_audio_files(audio_objects)

            yield 100, "Audio generation complete! ✅", segment_info
            yield merged_audio_path, "Done", segment_info
            return

    # For short texts or a single segment, synthesize in one request
    yield 50, "Generating audio...", None

    logger.info("Generating single segment audio...")
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
    # Reserve a temp file name; edge_tts writes to it after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)

    logger.info(f"Audio generated at {tmp_path}")
    yield 100, "Audio generation complete! ✅", None
    yield tmp_path, "Done", None
284
+
285
async def tts_interface(text, voice, rate, volume, pitch,
                        enable_cleaning, save_cleaned, clean_urls, clean_html,
                        clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
                        del_special, wetext_norm):
    """Gradio event handler: run TTS and stream UI updates.

    Each yield is a 4-tuple ``(audio, progress_update, status_text,
    segment_update)``.  The audio slot stays None until the final result.

    The cleaning checkboxes are packed into an options dict and handed to
    text_to_speech_generator, which performs the cleaning (and optional
    saving) itself, so the text is not cleaned here.
    """
    if not text.strip():
        yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
        return
    if not voice:
        yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
        return

    # Prepare cleaning options
    cleaning_options = {
        'enable_cleaning': enable_cleaning,
        'save_cleaned': save_cleaned,
        'remove_urls': clean_urls,
        'remove_html': clean_html,
        'remove_markdown': clean_markdown,
        'filter_ads': clean_ads,
        'fix_encoding': fix_enc,
        'tidy_whitespace': tidy_ws,
        'remove_gutenberg': del_gutenberg,
        'remove_special_chars': del_special,
        'wetext_normalization': wetext_norm
    }

    # Reset UI
    yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)

    async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
        if not (isinstance(result, tuple) and len(result) == 3):
            continue

        progress_val, status_msg, segment_info = result

        if isinstance(progress_val, (int, float)):
            # Progress update
            segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
            yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
        elif progress_val is None:
            # No audio path: the generator aborted (e.g. the text was empty
            # after cleaning) or produced nothing.  Show the reason rather
            # than reporting success.
            message = status_msg if status_msg and status_msg != "Done" else "No audio was produced."
            yield None, gr.update(value=message, visible=True), message, gr.update(visible=False)
        else:
            # Final result: progress_val is the audio file path
            audio_path = progress_val
            yield audio_path, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
342
+
343
async def create_demo():
    """Build the Gradio Blocks UI for the Edge TTS app and return it.

    Fetches the voice list with ``get_voices()``, lays out the input column
    (text box, text-cleaning options, voice and prosody controls) and the
    output column (audio player, progress, status and segment details), then
    wires ``text_input.change`` to ``analyze_text`` and ``generate_btn.click``
    to ``tts_interface``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet queued or launched).
    """
    voices = await get_voices()

    description = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.

    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥

    Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
    Transform your words into stunning, professional-quality videos in just a few clicks.

    Features:
    Convert text to engaging videos with customizable visuals
    Choose from 40+ languages and 300+ voices
    Perfect for creating audiobooks, storytelling, and language learning materials
    • Ideal for educators, content creators, and language enthusiasts

    📝 **Long Text Support**:
    Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
    """

    # Pre-select the first voice whose label contains "XiaoxiaoNeural";
    # if none is found the empty choice stays selected.
    default_voice = ""
    for voice_key in voices.keys():
        if "XiaoxiaoNeural" in voice_key:
            default_voice = voice_key
            break

    with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
        gr.Markdown("# Edge TTS Text-to-Speech")
        gr.Markdown(description)

        with gr.Row():
            # Left column: text input, cleaning options, voice and prosody controls.
            with gr.Column():
                text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")

                # Add text analysis info
                text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)

                with gr.Accordion("Text Cleaning Settings", open=True):
                    with gr.Row():
                        enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
                        save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)

                    # Individual cleaning switches; the whole group is hidden
                    # when cleaning is disabled (see toggle_options below).
                    with gr.Group(visible=True) as cleaning_options_group:
                        with gr.Row():
                            clean_urls = gr.Checkbox(label="Remove URLs", value=True)
                            clean_html = gr.Checkbox(label="Remove HTML", value=True)

                        with gr.Row():
                            clean_markdown = gr.Checkbox(label="Remove Markdown", value=True)
                            clean_ads = gr.Checkbox(label="Filter Ads", value=True)

                        with gr.Row():
                            fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
                            tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)

                        with gr.Row():
                            del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
                            del_special = gr.Checkbox(label="Remove Special Characters", value=True)

                        with gr.Row():
                            wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)

                    def toggle_options(enabled):
                        """Return an update that shows the option group only when cleaning is enabled."""
                        return gr.update(visible=enabled)

                    enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])

                # The leading "" entry lets the dropdown start with no voice selected.
                voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)

                with gr.Row():
                    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
                    volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
                    pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)

                generate_btn = gr.Button("Generate Audio", variant="primary")

            # Right column: generated audio plus progress/status read-outs.
            with gr.Column():
                audio_output = gr.Audio(label="Generated Audio", type="filepath")

                # Progress and status display
                with gr.Group():
                    gr.Markdown("### 📊 Processing Progress")
                    progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)

                # Processing details
                with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
                    status_output = gr.Markdown("Waiting...", visible=True)

                # Segment information display
                with gr.Accordion("📋 Segment Information", open=True) as segment_info:
                    segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)

        gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")

        # Add text analysis function
        def analyze_text(text):
            """Return a Markdown summary of the current input text.

            Reports the whitespace-separated word count, the character count
            and the duration from ``estimate_text_duration``. When that
            estimate exceeds 15 minutes, the number of segments produced by
            ``split_text_by_paragraphs`` is appended. The raw text is
            analysed; the cleaning options are not applied here.
            """
            if not text.strip():
                return "**Text Analysis**: Enter text to see estimated duration and segment count"

            duration = estimate_text_duration(text)
            word_count = len(text.split())
            char_count = len(text)

            # Only texts above the 15-minute threshold are segmented.
            if duration > 15:
                segments = split_text_by_paragraphs(text)
                segment_count = len(segments)
                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
            else:
                return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"

        # Update text analysis when text changes
        text_input.change(
            fn=analyze_text,
            inputs=[text_input],
            outputs=[text_info]
        )

        # The inputs are handed to tts_interface in this order, so keep the
        # list in sync with its parameter list. The four outputs correspond
        # to the 4-tuples that tts_interface yields.
        generate_btn.click(
            fn=tts_interface,
            inputs=[
                text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
                enable_cleaning, save_cleaned, clean_urls, clean_html,
                clean_markdown, clean_ads, fix_enc, tidy_ws, del_gutenberg,
                del_special, wetext_norm
            ],
            outputs=[audio_output, progress_info, status_output, segment_details]
        )

    return demo
473
+
474
async def main():
    """Build the demo UI, enable its request queue, and start the server."""
    app = await create_demo()
    # Queue incoming events with a default concurrency limit of 5.
    app.queue(default_concurrency_limit=5)
    # Start serving; the API page is switched off via show_api=False.
    app.launch(show_api=False)
478
+
479
# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
text_cleaning.py CHANGED
@@ -82,6 +82,47 @@ class TextCleaner:
82
 
83
  return '\n'.join(lines[start_idx:end_idx])
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  @staticmethod
86
  def remove_special_chars(text):
87
  """Remove excessive special characters"""
@@ -128,6 +169,9 @@ class TextCleaner:
128
  if options.get('remove_html', False):
129
  text = cls.remove_html(text)
130
 
 
 
 
131
  if options.get('remove_urls', False):
132
  text = cls.remove_urls(text)
133
 
 
82
 
83
  return '\n'.join(lines[start_idx:end_idx])
84
 
85
@staticmethod
def remove_markdown(text):
    """Strip Markdown formatting symbols, keeping the readable text.

    Fenced code blocks are dropped together with their content. For every
    other construct only the markup is removed: link text and image alt
    text are kept, their URLs are discarded.

    The substitutions run in three phases, and the order is significant:

    1. fenced code blocks;
    2. line-level markup (horizontal rules, blockquotes, headers, list
       markers);
    3. inline markup (inline code, images, links, bold, italic,
       strikethrough).

    Line-level markup is handled before emphasis so that a ``***`` / ``___``
    rule or a ``* item`` bullet is not misread as italic text. Images are
    handled before links because ``![alt](url)`` also contains the link
    pattern and would otherwise leave a stray ``!`` behind.

    Args:
        text: Markdown-formatted string.

    Returns:
        The text with Markdown symbols removed. Blank lines next to rules
        and list items are left in place, so paragraph boundaries survive.
    """
    # Phase 1: fenced code blocks (```code```), removed with their content.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Phase 2: line-level markup. These patterns deliberately use [ \t]
    # instead of \s: \s also matches newlines, which would swallow the
    # neighbouring blank lines and merge separate paragraphs.

    # Horizontal rules (---, ***, ___); \r is tolerated for CRLF input.
    text = re.sub(r'^[\-\*_]{3,}[ \t\r]*$', '', text, flags=re.MULTILINE)

    # Blockquotes (> text)
    text = re.sub(r'^>[ \t]+', '', text, flags=re.MULTILINE)

    # Headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}[ \t]+', '', text, flags=re.MULTILINE)

    # List markers (-, *, +, 1., 2., etc.)
    text = re.sub(r'^[ \t]*[\-\*\+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Phase 3: inline markup.

    # Inline code (`code`) -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Images ![alt](url) -> alt. Must precede the link rule below.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # Links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Bold (**text** or __text__). Underscore delimiters only count at word
    # boundaries, so identifiers such as snake_case_name stay intact.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)

    # Italic (*text* or _text_), with the same word-boundary rule for "_".
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Strikethrough (~~text~~)
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    return text
125
+
126
  @staticmethod
127
  def remove_special_chars(text):
128
  """Remove excessive special characters"""
 
169
  if options.get('remove_html', False):
170
  text = cls.remove_html(text)
171
 
172
+ if options.get('remove_markdown', False):
173
+ text = cls.remove_markdown(text)
174
+
175
  if options.get('remove_urls', False):
176
  text = cls.remove_urls(text)
177