cs2764 committed on
Commit
222dec9
·
verified ·
1 Parent(s): 11efcf3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -35
app.py CHANGED
@@ -9,6 +9,7 @@ import math
9
  import time
10
  from datetime import datetime, timedelta
11
  import logging
 
12
 
13
  # Configure logging
14
  logging.basicConfig(
@@ -119,49 +120,53 @@ def split_text_by_paragraphs(text, max_duration_minutes=5):
119
  logger.info(f"Split text into {len(segments)} segments.")
120
  return segments
121
 
 
 
122
  async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
123
- """Generate audio for a single text segment"""
124
  logger.info(f"Generating segment {segment_index}...")
125
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
126
- with tempfile.NamedTemporaryFile(delete=False, suffix=f"_segment_{segment_index}.mp3") as tmp_file:
127
- tmp_path = tmp_file.name
128
- await communicate.save(tmp_path)
 
 
 
 
129
 
130
  # Verify segment duration
131
  try:
132
- seg_audio = AudioSegment.from_mp3(tmp_path)
 
 
133
  duration_min = len(seg_audio) / 1000 / 60
134
- logger.info(f"Segment {segment_index} generated at {tmp_path} (Duration: {duration_min:.2f} min)")
135
  except Exception as e:
136
  logger.error(f"Error checking segment {segment_index} duration: {e}")
137
 
138
- return tmp_path
 
139
 
140
- async def merge_audio_files(audio_files):
141
- """Merge multiple audio files into one"""
142
- if not audio_files:
143
  return None
144
 
145
- if len(audio_files) == 1:
146
- return audio_files[0]
147
 
148
- logger.info(f"Merging {len(audio_files)} audio files...")
149
  # Load and merge audio segments
150
  combined = AudioSegment.empty()
151
- for audio_file in audio_files:
152
  try:
153
- segment = AudioSegment.from_mp3(audio_file)
 
154
  combined += segment
 
 
155
  except Exception as e:
156
- logger.error(f"Error merging file {audio_file}: {e}")
157
-
158
- # Clean up temporary segment file
159
- try:
160
- os.remove(audio_file)
161
- except:
162
- pass
163
 
164
- # Save merged audio
165
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
166
  merged_path = tmp_file.name
167
  combined.export(merged_path, format="mp3")
@@ -170,7 +175,7 @@ async def merge_audio_files(audio_files):
170
  logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
171
  return merged_path
172
 
173
- async def text_to_speech_generator(text, voice, rate, volume, pitch):
174
  """Generate speech with detailed progress tracking via generator"""
175
  if not text.strip():
176
  yield None, "Please enter text to convert.", None
@@ -178,7 +183,25 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
178
  if not voice:
179
  yield None, "Please select a voice.", None
180
  return
181
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  voice_short_name = voice.split(" - ")[0]
183
  rate_str = f"{rate:+d}%"
184
  volume_str = f"{volume:+d}%"
@@ -199,7 +222,7 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
199
 
200
  if total_segments > 1:
201
  # Generate audio for each segment with progress tracking
202
- audio_files = []
203
  start_time = time.time()
204
 
205
  for i, segment in enumerate(segments):
@@ -216,18 +239,19 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
216
  logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
217
  yield progress, status_msg, segment_info
218
 
219
- audio_file = await generate_audio_segment(
 
220
  segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
221
  )
222
- audio_files.append(audio_file)
223
 
224
  yield 90, "Merging audio files...", segment_info
225
 
226
- # Merge all audio files
227
- merged_audio = await merge_audio_files(audio_files)
228
 
229
  yield 100, "Audio generation complete! ✅", segment_info
230
- yield merged_audio, "Done", segment_info
231
  return
232
 
233
  # For short texts or single segment, use original method
@@ -243,7 +267,10 @@ async def text_to_speech_generator(text, voice, rate, volume, pitch):
243
  yield 100, "Audio generation complete! ✅", None
244
  yield tmp_path, "Done", None
245
 
246
- async def tts_interface(text, voice, rate, volume, pitch):
 
 
 
247
  """Enhanced TTS interface with detailed progress tracking"""
248
  if not text.strip():
249
  yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
@@ -252,12 +279,38 @@ async def tts_interface(text, voice, rate, volume, pitch):
252
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
253
  return
254
 
255
- estimated_duration = estimate_text_duration(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  # Reset UI
258
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
259
 
260
- async for result in text_to_speech_generator(text, voice, rate, volume, pitch):
261
  if isinstance(result, tuple) and len(result) == 3:
262
  # Progress update
263
  progress_val, status_msg, segment_info = result
@@ -309,6 +362,33 @@ async def create_demo():
309
  # Add text analysis info
310
  text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
313
 
314
  with gr.Row():
@@ -361,7 +441,12 @@ async def create_demo():
361
 
362
  generate_btn.click(
363
  fn=tts_interface,
364
- inputs=[text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider],
 
 
 
 
 
365
  outputs=[audio_output, progress_info, status_output, segment_details]
366
  )
367
 
 
9
  import time
10
  from datetime import datetime, timedelta
11
  import logging
12
+ from text_cleaning import TextCleaner
13
 
14
  # Configure logging
15
  logging.basicConfig(
 
120
  logger.info(f"Split text into {len(segments)} segments.")
121
  return segments
122
 
123
+ import io
124
+
125
  async def generate_audio_segment(text_segment, voice_short_name, rate_str, volume_str, pitch_str, segment_index):
126
+ """Generate audio for a single text segment and return as BytesIO"""
127
  logger.info(f"Generating segment {segment_index}...")
128
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
129
+
130
+ audio_data = io.BytesIO()
131
+ async for chunk in communicate.stream():
132
+ if chunk["type"] == "audio":
133
+ audio_data.write(chunk["data"])
134
+
135
+ audio_data.seek(0)
136
 
137
  # Verify segment duration
138
  try:
139
+ # Make a copy for verification so we don't consume the main buffer
140
+ verify_buffer = io.BytesIO(audio_data.getvalue())
141
+ seg_audio = AudioSegment.from_mp3(verify_buffer)
142
  duration_min = len(seg_audio) / 1000 / 60
143
+ logger.info(f"Segment {segment_index} generated in memory (Duration: {duration_min:.2f} min)")
144
  except Exception as e:
145
  logger.error(f"Error checking segment {segment_index} duration: {e}")
146
 
147
+ audio_data.seek(0)
148
+ return audio_data
149
 
150
+ async def merge_audio_files(audio_objects):
151
+ """Merge multiple audio BytesIO objects into one file"""
152
+ if not audio_objects:
153
  return None
154
 
155
+ logger.info(f"Merging {len(audio_objects)} audio segments...")
 
156
 
 
157
  # Load and merge audio segments
158
  combined = AudioSegment.empty()
159
+ for i, audio_obj in enumerate(audio_objects):
160
  try:
161
+ audio_obj.seek(0)
162
+ segment = AudioSegment.from_mp3(audio_obj)
163
  combined += segment
164
+ # Explicitly close/clear the BytesIO object to free memory
165
+ audio_obj.close()
166
  except Exception as e:
167
+ logger.error(f"Error merging segment {i+1}: {e}")
 
 
 
 
 
 
168
 
169
+ # Save merged audio to a single temporary file
170
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
171
  merged_path = tmp_file.name
172
  combined.export(merged_path, format="mp3")
 
175
  logger.info(f"Merged audio saved to {merged_path} (Total Duration: {total_duration_min:.2f} min)")
176
  return merged_path
177
 
178
+ async def text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options=None):
179
  """Generate speech with detailed progress tracking via generator"""
180
  if not text.strip():
181
  yield None, "Please enter text to convert.", None
 
183
  if not voice:
184
  yield None, "Please select a voice.", None
185
  return
186
+
187
+ # Apply text cleaning if enabled
188
+ if cleaning_options and cleaning_options.get('enable_cleaning', False):
189
+ yield 0, "Cleaning text...", None
190
+ # original_text = text # Unused
191
+ text = TextCleaner.clean_text(text, cleaning_options)
192
+
193
+ if cleaning_options.get('save_cleaned', False):
194
+ # Create a filename based on timestamp or first few words
195
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
196
+ filename = f"text_{timestamp}.txt"
197
+ saved_path = TextCleaner.save_cleaned_text(text, filename)
198
+ if saved_path:
199
+ logger.info(f"Saved cleaned text to {saved_path}")
200
+
201
+ if not text.strip():
202
+ yield None, "Text cleaning resulted in empty text.", None
203
+ return
204
+
205
  voice_short_name = voice.split(" - ")[0]
206
  rate_str = f"{rate:+d}%"
207
  volume_str = f"{volume:+d}%"
 
222
 
223
  if total_segments > 1:
224
  # Generate audio for each segment with progress tracking
225
+ audio_objects = []
226
  start_time = time.time()
227
 
228
  for i, segment in enumerate(segments):
 
239
  logger.info(f"Progress: {status_msg.replace(chr(10), ', ')}")
240
  yield progress, status_msg, segment_info
241
 
242
+ # Generate to memory
243
+ audio_obj = await generate_audio_segment(
244
  segment, voice_short_name, rate_str, volume_str, pitch_str, i+1
245
  )
246
+ audio_objects.append(audio_obj)
247
 
248
  yield 90, "Merging audio files...", segment_info
249
 
250
+ # Merge all audio objects
251
+ merged_audio_path = await merge_audio_files(audio_objects)
252
 
253
  yield 100, "Audio generation complete! ✅", segment_info
254
+ yield merged_audio_path, "Done", segment_info
255
  return
256
 
257
  # For short texts or single segment, use original method
 
267
  yield 100, "Audio generation complete! ✅", None
268
  yield tmp_path, "Done", None
269
 
270
+ async def tts_interface(text, voice, rate, volume, pitch,
271
+ enable_cleaning, save_cleaned, clean_urls, clean_html,
272
+ clean_ads, fix_enc, tidy_ws, del_gutenberg,
273
+ del_special, wetext_norm):
274
  """Enhanced TTS interface with detailed progress tracking"""
275
  if not text.strip():
276
  yield None, gr.update(visible=False), "Please enter text.", gr.update(visible=False)
 
279
  yield None, gr.update(visible=False), "Please select a voice.", gr.update(visible=False)
280
  return
281
 
282
+ # Prepare cleaning options
283
+ cleaning_options = {
284
+ 'enable_cleaning': enable_cleaning,
285
+ 'save_cleaned': save_cleaned,
286
+ 'remove_urls': clean_urls,
287
+ 'remove_html': clean_html,
288
+ 'filter_ads': clean_ads,
289
+ 'fix_encoding': fix_enc,
290
+ 'tidy_whitespace': tidy_ws,
291
+ 'remove_gutenberg': del_gutenberg,
292
+ 'remove_special_chars': del_special,
293
+ 'wetext_normalization': wetext_norm
294
+ }
295
+
296
+ # We need to clean text here first to estimate duration correctly?
297
+ # Or let the generator handle it. The generator handles it, but estimation might be off.
298
+ # Ideally we clean first if enabled, then estimate.
299
+
300
+ working_text = text
301
+ if enable_cleaning:
302
+ working_text = TextCleaner.clean_text(text, cleaning_options)
303
+ if save_cleaned:
304
+ # We'll let the generator save it to avoid double saving or complex logic here,
305
+ # but we need to pass the options.
306
+ pass
307
+
308
+ estimated_duration = estimate_text_duration(working_text)
309
 
310
  # Reset UI
311
  yield None, gr.update(value="Starting...", visible=True), "Initializing...", gr.update(visible=False)
312
 
313
+ async for result in text_to_speech_generator(text, voice, rate, volume, pitch, cleaning_options):
314
  if isinstance(result, tuple) and len(result) == 3:
315
  # Progress update
316
  progress_val, status_msg, segment_info = result
 
362
  # Add text analysis info
363
  text_info = gr.Markdown("**Text Analysis**: Enter text to see estimated duration and segment count", visible=True)
364
 
365
+ with gr.Accordion("Text Cleaning Settings", open=True):
366
+ with gr.Row():
367
+ enable_cleaning = gr.Checkbox(label="Enable Text Cleaning", value=True)
368
+ save_cleaned = gr.Checkbox(label="Save Cleaned Text File", value=True)
369
+
370
+ with gr.Group(visible=True) as cleaning_options_group:
371
+ with gr.Row():
372
+ clean_urls = gr.Checkbox(label="Remove URLs", value=True)
373
+ clean_html = gr.Checkbox(label="Remove HTML", value=True)
374
+
375
+ with gr.Row():
376
+ clean_ads = gr.Checkbox(label="Filter Ads", value=True)
377
+ fix_enc = gr.Checkbox(label="Fix Encoding", value=True)
378
+
379
+ with gr.Row():
380
+ tidy_ws = gr.Checkbox(label="Tidy Whitespace", value=True)
381
+ del_gutenberg = gr.Checkbox(label="Remove Project Gutenberg", value=True)
382
+
383
+ with gr.Row():
384
+ del_special = gr.Checkbox(label="Remove Special Characters", value=True)
385
+ wetext_norm = gr.Checkbox(label="Enable WeText Normalization", value=True)
386
+
387
+ def toggle_options(enabled):
388
+ return gr.update(visible=enabled)
389
+
390
+ enable_cleaning.change(fn=toggle_options, inputs=[enable_cleaning], outputs=[cleaning_options_group])
391
+
392
  voice_dropdown = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice)
393
 
394
  with gr.Row():
 
441
 
442
  generate_btn.click(
443
  fn=tts_interface,
444
+ inputs=[
445
+ text_input, voice_dropdown, rate_slider, volume_slider, pitch_slider,
446
+ enable_cleaning, save_cleaned, clean_urls, clean_html,
447
+ clean_ads, fix_enc, tidy_ws, del_gutenberg,
448
+ del_special, wetext_norm
449
+ ],
450
  outputs=[audio_output, progress_info, status_output, segment_details]
451
  )
452