cs2764 committed on
Commit
11efcf3
·
verified ·
1 Parent(s): e5d9894

Upload 2 files

Browse files

Add long text support

Files changed (2) hide show
  1. app.py +319 -28
  2. requirements.txt +3 -2
app.py CHANGED
@@ -3,32 +3,273 @@ import edge_tts
3
  import asyncio
4
  import tempfile
5
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  async def get_voices():
8
  voices = await edge_tts.list_voices()
9
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
10
 
11
- async def text_to_speech(text, voice, rate, volume, pitch):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  if not text.strip():
13
- return None, "Please enter text to convert."
 
14
  if not voice:
15
- return None, "Please select a voice."
 
16
 
17
  voice_short_name = voice.split(" - ")[0]
18
  rate_str = f"{rate:+d}%"
19
  volume_str = f"{volume:+d}%"
20
  pitch_str = f"{pitch:+d}Hz"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
22
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
23
  tmp_path = tmp_file.name
24
  await communicate.save(tmp_path)
25
- return tmp_path, None
 
 
 
26
 
27
  async def tts_interface(text, voice, rate, volume, pitch):
28
- audio, warning = await text_to_speech(text, voice, rate, volume, pitch)
29
- if warning:
30
- return audio, gr.Warning(warning)
31
- return audio, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  async def create_demo():
34
  voices = await get_voices()
@@ -46,6 +287,9 @@ async def create_demo():
46
  • Choose from 40+ languages and 300+ voices
47
  • Perfect for creating audiobooks, storytelling, and language learning materials
48
  • Ideal for educators, content creators, and language enthusiasts
 
 
 
49
  """
50
 
51
  default_voice = ""
@@ -54,26 +298,73 @@ async def create_demo():
54
  default_voice = voice_key
55
  break
56
 
57
- demo = gr.Interface(
58
- fn=tts_interface,
59
- inputs=[
60
- gr.Textbox(label="Input Text", lines=5),
61
- gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
62
- gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
63
- gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume Adjustment (%)", step=1),
64
- gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
65
- ],
66
- outputs=[
67
- gr.Audio(label="Generated Audio", type="filepath"),
68
- gr.Markdown(label="Warning", visible=False)
69
- ],
70
- title="Edge TTS Text-to-Speech",
71
- description=description,
72
- article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
73
- analytics_enabled=False,
74
- allow_flagging="manual",
75
- api_name=None
76
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  return demo
78
 
79
  async def main():
 
3
  import asyncio
4
  import tempfile
5
  import os
6
+ import re
7
+ from pydub import AudioSegment
8
+ import math
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ import logging
12
+
13
# Logging setup: INFO and above goes to a stream handler, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-wide logger shared by the helpers in this file.
logger = logging.getLogger(__name__)
22
 
23
async def get_voices():
    """Return the available voices keyed by a human-readable label.

    Awaits ``edge_tts.list_voices()`` and maps each entry to
    ``"<ShortName> - <Locale> (<Gender>)" -> <ShortName>``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled
26
 
27
def format_time_remaining(seconds):
    """Render a duration in seconds as a short label.

    Below one minute the value is truncated to whole seconds ("42s");
    below one hour it is shown in minutes with one decimal ("1.5m");
    anything else is shown in hours with one decimal ("2.0h").
    """
    if seconds < 60:
        return f"{int(seconds)}s"

    # Pick the unit once, then share the one-decimal formatting.
    divisor, suffix = (60, "m") if seconds < 3600 else (3600, "h")
    return f"{seconds / divisor:.1f}{suffix}"
37
+
38
def calculate_eta(start_time, completed_items, total_items):
    """Estimate the time left from the average pace so far.

    ``start_time`` is a ``time.time()`` timestamp taken when work began.
    Returns "Calculating..." while nothing has completed yet; otherwise the
    elapsed time per completed item is multiplied by the number of items
    still outstanding and formatted with ``format_time_remaining``.
    """
    if completed_items == 0:
        # No pace to extrapolate from yet.
        return "Calculating..."

    seconds_per_item = (time.time() - start_time) / completed_items
    outstanding = total_items - completed_items
    return format_time_remaining(seconds_per_item * outstanding)
49
+
50
def estimate_text_duration(text):
    """Estimate how long *text* takes to speak, in minutes.

    Heuristic: when at least 10% of the characters are spaces the text is
    treated as space-separated (e.g. English) and paced at ~150 words per
    minute; otherwise it is treated as unspaced (e.g. Chinese) and paced at
    ~300 characters per minute. Empty input yields 0. The chosen estimate is
    logged at INFO level.
    """
    if not text:
        return 0

    char_total = len(text)
    space_ratio = text.count(' ') / char_total

    if space_ratio >= 0.1:
        # Space-separated text: pace by words.
        word_count = len(text.split())
        duration = word_count / 150
        logger.info(f"Estimated duration (word-based): {duration:.2f} min ({word_count} words)")
        return duration

    # Few or no spaces: pace by characters.
    total_len = char_total
    duration = total_len / 300
    logger.info(f"Estimated duration (char-based): {duration:.2f} min ({total_len} chars)")
    return duration
75
+
76
def split_text_by_paragraphs(text, max_duration_minutes=5):
    """Split *text* into segments of at most ``max_duration_minutes`` each.

    Durations are the estimates from ``estimate_text_duration``; no extra
    safety margin is applied. Text that already fits is returned unchanged
    as a one-element list. Otherwise paragraphs (separated by a blank line)
    are packed greedily into segments, and a paragraph that is too long on
    its own is packed sentence by sentence instead.

    Sentences keep their original terminator. A boundary is recognised
    after ``.``, ``!`` or ``?`` when followed by whitespace (so "3.14" and
    "e.g.x" are not cut), and after the full-width terminators ``。！？``
    regardless of following whitespace, since such text is usually unspaced.

    A single sentence that exceeds the limit by itself is not broken up
    further and becomes its own oversized segment.

    Returns:
        list[str]: the non-empty, stripped segments in original order.
    """
    max_duration = max_duration_minutes
    estimated_duration = estimate_text_duration(text)

    logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Limit={max_duration}m")

    if estimated_duration <= max_duration:
        return [text]

    logger.info(f"Text duration ({estimated_duration:.2f}m) exceeds limit ({max_duration}m). Splitting...")

    # Lookbehinds keep the terminator attached to its sentence. The negative
    # lookahead stops a run such as "。。" from being split into fragments.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+|(?<=[。！？])(?![。！？])\s*')

    segments = []
    current_segment = ""

    for paragraph in text.split('\n\n'):
        if estimate_text_duration(paragraph) > max_duration:
            # Paragraph is too long on its own: pack it sentence by sentence.
            for sentence in sentence_boundary.split(paragraph):
                sentence = sentence.strip()
                if not sentence:
                    continue

                if current_segment and estimate_text_duration(current_segment + sentence) > max_duration:
                    segments.append(current_segment.strip())
                    current_segment = sentence + " "
                else:
                    current_segment += sentence + " "
        else:
            if current_segment and estimate_text_duration(current_segment + paragraph) > max_duration:
                segments.append(current_segment.strip())
                current_segment = paragraph + "\n\n"
            else:
                current_segment += paragraph + "\n\n"

    # Flush whatever is left in the accumulator.
    if current_segment.strip():
        segments.append(current_segment.strip())

    logger.info(f"Split text into {len(segments)} segments.")
    return segments
121
+
122
async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
    """Synthesize one text segment to a temporary MP3 file and return its path.

    Args:
        text_segment: the text to speak.
        voice_short_name: voice identifier passed to ``edge_tts.Communicate``.
        rate_str, volume_str, pitch_str: pre-formatted prosody strings passed
            through to ``edge_tts.Communicate``.
        segment_index: number used in the temp-file suffix and log messages.

    Returns:
        Path of the generated MP3. The file is created with ``delete=False``,
        so the caller is responsible for removing it.

    Raises:
        Whatever ``communicate.save`` raises; the temporary file is removed
        before the exception propagates.
    """
    logger.info(f"Generating segment {segment_index}...")
    communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)

    # Reserve a unique path only; the handle is closed before the file is written.
    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_segment_{segment_index}.mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # BaseException so task cancellation is covered too: with delete=False
        # nothing else would ever remove the orphaned file.
        try:
            os.remove(tmp_path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
        raise

    # Best-effort duration check, for logging only; a decode failure must not
    # discard an otherwise generated segment.
    try:
        seg_audio = AudioSegment.from_mp3(tmp_path)
        duration_min = len(seg_audio) / 1000 / 60
        logger.info(f"Segment {segment_index} generated at {tmp_path} (Duration: {duration_min:.2f} min)")
    except Exception as e:
        logger.error(f"Error checking segment {segment_index} duration: {e}")

    return tmp_path
139
+
140
async def merge_audio_files(audio_files):
    """Concatenate MP3 files, in order, into one new temporary MP3.

    Args:
        audio_files: list of MP3 paths.

    Returns:
        ``None`` for an empty list; the single input path unchanged when
        there is only one file; otherwise the path of a newly written merged
        MP3 (created with ``delete=False``).

    When merging, a file that cannot be decoded is logged and skipped, and
    every input file is deleted after its merge attempt.
    """
    if not audio_files:
        return None

    if len(audio_files) == 1:
        return audio_files[0]

    logger.info(f"Merging {len(audio_files)} audio files...")
    # Load and merge audio segments
    combined = AudioSegment.empty()
    for audio_file in audio_files:
        try:
            combined += AudioSegment.from_mp3(audio_file)
        except Exception as e:
            logger.error(f"Error merging file {audio_file}: {e}")

        # Clean up the temporary segment file. Only filesystem errors are
        # tolerated here; a bare except would also hide KeyboardInterrupt
        # and SystemExit.
        try:
            os.remove(audio_file)
        except OSError as e:
            logger.warning(f"Could not remove temporary file {audio_file}: {e}")

    # Reserve the output path, then export once the handle is closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        merged_path = tmp_file.name
    combined.export(merged_path, format="mp3")

    total_duration_min = len(combined) / 1000 / 60
    logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
    return merged_path
172
+
173
def _discard_files(paths):
    """Best-effort removal of temporary files; missing files are ignored."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass


async def text_to_speech_generator(text, voice, rate, volume, pitch):
    """Generate speech for *text*, yielding progress along the way.

    Every yielded item is a 3-tuple ``(first, message, segment_info)``:

    * ``first`` is a number (0-100) for a progress update;
    * ``first`` is ``None`` when the input was rejected (blank text or no
      voice); ``message`` then explains why and nothing else is yielded;
    * ``first`` is the path of the finished MP3 on the final item, whose
      message is "Done".

    ``segment_info`` is a summary string once the text has been split, and
    ``None`` otherwise.

    Text estimated at more than 15 minutes is split with
    ``split_text_by_paragraphs``; if that gives more than one segment, each
    is synthesized separately and the results are merged. Otherwise the
    whole text is synthesized in one request.

    Args:
        text: text to speak.
        voice: dropdown label; the part before the first " - " is the voice
            short name.
        rate, volume: percentage adjustments; pitch: adjustment in Hz.
            Values are truncated to integers before formatting.
    """
    if not text.strip():
        yield None, "Please enter text to convert.", None
        return
    if not voice:
        yield None, "Please select a voice.", None
        return

    voice_short_name = voice.split(" - ")[0]
    # The "+d" format code rejects floats, so coerce in case a slider
    # delivers e.g. 5.0 instead of 5.
    rate_str = f"{int(rate):+d}%"
    volume_str = f"{int(volume):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    # Check if text is too long and needs segmentation
    estimated_duration = estimate_text_duration(text)

    yield 0, "Starting text processing...", None
    logger.info(f"Starting TTS for text with estimated duration: {estimated_duration:.2f}m")

    if estimated_duration > 15:  # If longer than 15 minutes, split into segments
        segments = split_text_by_paragraphs(text)
        total_segments = len(segments)

        segment_info = f"Text split into {total_segments} segments. Total estimated duration: {estimated_duration:.1f} min"
        yield 5, segment_info, segment_info

        if total_segments > 1:
            # Generate audio for each segment with progress tracking
            audio_files = []
            start_time = time.time()

            try:
                for i, segment in enumerate(segments):
                    if not segment.strip():
                        continue

                    segment_duration = estimate_text_duration(segment)

                    progress = 10 + (80 * i / total_segments)  # 10% to 90%
                    eta = calculate_eta(start_time, i, total_segments)
                    status_msg = (
                        f"Generating segment {i+1}/{total_segments}...\n"
                        f"Segment duration: {segment_duration:.1f} min\n"
                        f"ETA: {eta}"
                    )
                    logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
                    yield progress, status_msg, segment_info

                    audio_file = await generate_audio_segment(
                        segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
                    )
                    audio_files.append(audio_file)

                yield 90, "Merging audio files...", segment_info

                # Merge all audio files
                merged_audio = await merge_audio_files(audio_files)
            except BaseException:
                # A failed segment, a failed merge, cancellation, or the
                # consumer closing the generator early would otherwise leave
                # the segment files produced so far on disk.
                _discard_files(audio_files)
                raise

            yield 100, "Audio generation complete! ✅", segment_info
            yield merged_audio, "Done", segment_info
            return

    # For short texts or single segment, use original method
    yield 50, "Generating audio...", None

    logger.info("Generating single segment audio...")
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
    # Reserve a unique path only; the handle is closed before the file is written.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nothing else removes the file on failure.
        _discard_files([tmp_path])
        raise

    logger.info(f"Audio generated at {tmp_path}")
    yield 100, "Audio generation complete! ✅", None
    yield tmp_path, "Done", None
245
 
246
async def tts_interface(text, voice, rate, volume, pitch):
    """Gradio event handler that streams TTS progress to the UI.

    Each yielded item is a 4-tuple, one value per output component in the
    order they are wired to the generate button:
    ``(audio path or None, progress markdown update, status text,
    segment-details markdown update)``.

    Blank text or a missing voice yields a single message and stops.
    Otherwise the UI is reset and every item produced by
    ``text_to_speech_generator`` is translated into a UI update: numeric
    first elements are progress updates, ``None`` is a rejection message,
    and any other value is the path of the finished audio file.
    """
    if not text.strip():
        yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
        return
    if not voice:
        yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
        return

    # Reset UI
    yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)

    async for result in text_to_speech_generator(text, voice, rate, volume, pitch):
        if not (isinstance(result, tuple) and len(result) == 3):
            # Not an item shape this handler understands; ignore it.
            continue

        progress_val, status_msg, segment_info = result

        if isinstance(progress_val, (int, float)):
            # Progress update: show the message, plus segment details when present.
            segment_update = gr.update(value=segment_info, visible=True) if segment_info else gr.update(visible=False)
            yield None, gr.update(value=status_msg, visible=True), status_msg, segment_update
        elif progress_val is None:
            # The generator rejected the request; surface its message
            # instead of announcing a completion with no audio.
            yield None, gr.update(value=status_msg, visible=True), status_msg, gr.update(visible=False)
        else:
            # Final result: the first element is the audio file path.
            audio_path = progress_val
            yield audio_path, gr.update(value="Complete!", visible=True), "Generation Complete", gr.update(visible=True)
273
 
274
  async def create_demo():
275
  voices = await get_voices()
 
287
  • Choose from 40+ languages and 300+ voices
288
  • Perfect for creating audiobooks, storytelling, and language learning materials
289
  • Ideal for educators, content creators, and language enthusiasts
290
+
291
+ 📝 **Long Text Support**:
292
+ Texts longer than 15 minutes will be **automatically segmented** into smaller chunks for processing and then **merged back** into a single high-quality audio file. This ensures stability and allows for unlimited text length!
293
  """
294
 
295
  default_voice = ""
 
298
  default_voice = voice_key
299
  break
300
 
301
+ with gr.Blocks(title="Edge TTS Text-to-Speech") as demo:
302
+ gr.Markdown("# Edge TTS Text-to-Speech")
303
+ gr.Markdown(description)
304
+
305
+ with gr.Row():
306
+ with gr.Column():
307
+ text_input = gr.Textbox(label="Input Text", lines=8, placeholder="Enter your text here... Long texts will be automatically segmented if they exceed 15 minutes of speech time.")
308
+
309
+ # Add text analysis info
310
+ text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
311
+
312
+ voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
313
+
314
+ with gr.Row():
315
+ rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate (%)", step=1)
316
+ volume_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Volume (%)", step=1)
317
+ pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch (Hz)", step=1)
318
+
319
+ generate_btn = gr.Button("Generate Audio", variant="primary")
320
+
321
+ with gr.Column():
322
+ audio_output = gr.Audio(label="Generated Audio", type="filepath")
323
+
324
+ # Progress and status display
325
+ with gr.Group():
326
+ gr.Markdown("### 📊 Processing Progress")
327
+ progress_info = gr.Markdown("Ready, click Generate to start...", visible=True)
328
+
329
+ # Processing details
330
+ with gr.Accordion("🔍 Processing Details", open=True) as processing_details:
331
+ status_output = gr.Markdown("Waiting...", visible=True)
332
+
333
+ # Segment information display
334
+ with gr.Accordion("📋 Segment Information", open=True) as segment_info:
335
+ segment_details = gr.Markdown("Segment details will appear here for long texts", visible=True)
336
+
337
+ gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
338
+
339
+ # Add text analysis function
340
+ def analyze_text(text):
341
+ if not text.strip():
342
+ return "**Text Analysis**: Enter text to see estimated duration and segment count"
343
+
344
+ duration = estimate_text_duration(text)
345
+ word_count = len(text.split())
346
+ char_count = len(text)
347
+
348
+ if duration > 15:
349
+ segments = split_text_by_paragraphs(text)
350
+ segment_count = len(segments)
351
+ return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time, {segment_count} segments will be generated"
352
+ else:
353
+ return f"**Text Analysis**: {word_count} words, {char_count} characters, ~{duration:.1f} minutes speech time"
354
+
355
+ # Update text analysis when text changes
356
+ text_input.change(
357
+ fn=analyze_text,
358
+ inputs=[text_input],
359
+ outputs=[text_info]
360
+ )
361
+
362
+ generate_btn.click(
363
+ fn=tts_interface,
364
+ inputs=[text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider],
365
+ outputs=[audio_output, progress_info, status_output, segment_details]
366
+ )
367
+
368
  return demo
369
 
370
  async def main():
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- edge_tts==7.0.0
2
- gradio==5.21.0
 
 
1
+ edge_tts>=7.0.0
2
+ gradio>=4.0.0
3
+ pydub>=0.25.1