hivecorp committed on
Commit
49eff30
·
verified ·
1 Parent(s): 411d260

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -280
app.py CHANGED
@@ -72,7 +72,8 @@ class TextProcessor:
72
 
73
  # Adjust for punctuation density
74
  punct_count = sum(text.count(p) for p in self.punctuation_weights.keys())
75
- complexity *= (1 + (punct_count / len(words)) * 0.5)
 
76
 
77
  return complexity
78
 
@@ -111,38 +112,39 @@ class TextProcessor:
111
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
112
  text = re.sub(r'\s+([.!?,;:])', r'\1', text)
113
 
114
- # First, split into major segments by strong punctuation
115
  segments = []
116
- current_segment = []
117
- current_text = ""
118
  words = text.split()
119
 
120
  i = 0
121
  while i < len(words):
122
- complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
123
- breaks = self.find_natural_breaks(' '.join(words[i:i + int(self.max_segment_words * complexity)]))
 
 
 
124
 
125
- # Find best break point
126
- best_break = None
127
- best_weight = 0
 
 
128
 
129
  for break_idx, weight in breaks:
130
- actual_idx = i + break_idx
131
- if (actual_idx - i >= self.min_segment_words and
132
- actual_idx - i <= self.max_segment_words):
133
- if weight > best_weight:
134
- best_break = break_idx
135
- best_weight = weight
 
136
 
137
- if best_break is None:
138
- # If no good break found, use maximum length
139
- best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)
140
 
141
- # Create segment
142
- segment_words = words[i:i + best_break + 1]
143
  segment_text = ' '.join(segment_words)
144
 
145
- # Split segment into lines
146
  lines = self.split_into_lines(segment_text)
147
  final_segment_text = '\n'.join(lines)
148
 
@@ -166,7 +168,6 @@ class TextProcessor:
166
  current_line.append(word)
167
  word_count += 1
168
 
169
- # Check for natural line breaks
170
  is_break = (
171
  word_count >= self.words_per_line or
172
  any(word.endswith(p) for p in '.!?') or
@@ -174,7 +175,7 @@ class TextProcessor:
174
  any(word.endswith(p) for p in ',;:'))
175
  )
176
 
177
- if is_break:
178
  lines.append(' '.join(current_line))
179
  current_line = []
180
  word_count = 0
@@ -184,16 +185,15 @@ class TextProcessor:
184
 
185
  return lines
186
 
187
- # IMPROVEMENT 1: Enhanced Error Handling
188
  class TTSError(Exception):
189
  """Custom exception for TTS processing errors"""
190
  pass
191
 
192
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
193
  """Process a complete segment as a single TTS unit with improved error handling"""
194
- audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
 
195
  try:
196
- # Process the entire segment text as one unit, replacing newlines with spaces
197
  segment_text = ' '.join(segment.text.split('\n'))
198
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
199
 
@@ -207,7 +207,6 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
207
 
208
  try:
209
  segment.audio = AudioSegment.from_file(audio_file)
210
- # Reduced silence to 30ms for more natural flow
211
  silence = AudioSegment.silent(duration=30)
212
  segment.audio = silence + segment.audio + silence
213
  segment.duration = len(segment.audio)
@@ -224,20 +223,15 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
224
  try:
225
  os.remove(audio_file)
226
  except Exception:
227
- pass # Ignore deletion errors
228
 
229
- # IMPROVEMENT 2: Better File Management with cleanup
230
  class FileManager:
231
  """Manages temporary and output files with cleanup capabilities"""
232
  def __init__(self):
233
  self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
234
  self.output_files = []
235
- self.max_files_to_keep = 5 # Keep only the 5 most recent output pairs
236
 
237
- def get_temp_path(self, prefix):
238
- """Get a path for a temporary file"""
239
- return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")
240
-
241
  def create_output_paths(self):
242
  """Create paths for output files"""
243
  unique_id = str(uuid.uuid4())
@@ -252,276 +246,164 @@ class FileManager:
252
  def cleanup_old_files(self):
253
  """Clean up old output files, keeping only the most recent ones"""
254
  if len(self.output_files) > self.max_files_to_keep:
255
- old_files = self.output_files[:-self.max_files_to_keep]
256
- for srt_path, audio_path in old_files:
257
  try:
258
- if os.path.exists(srt_path):
259
- os.remove(srt_path)
260
- if os.path.exists(audio_path):
261
- os.remove(audio_path)
262
  except Exception:
263
- pass # Ignore deletion errors
264
-
265
- # Update the list to only include files we're keeping
266
  self.output_files = self.output_files[-self.max_files_to_keep:]
267
 
268
  def cleanup_all(self):
269
  """Clean up all managed files"""
270
  for srt_path, audio_path in self.output_files:
271
  try:
272
- if os.path.exists(srt_path):
273
- os.remove(srt_path)
274
- if os.path.exists(audio_path):
275
- os.remove(audio_path)
276
  except Exception:
277
- pass # Ignore deletion errors
278
-
279
  try:
280
- os.rmdir(self.temp_dir)
281
  except Exception:
282
- pass # Ignore if directory isn't empty or can't be removed
283
 
284
- # Create global file manager
285
  file_manager = FileManager()
286
 
287
- # IMPROVEMENT 3: Parallel Processing for Segments
288
  async def generate_accurate_srt(
289
- text: str,
290
- voice: str,
291
- rate: str,
292
- pitch: str,
293
- words_per_line: int,
294
- lines_per_segment: int,
295
- progress_callback=None,
296
- parallel: bool = True,
297
- max_workers: int = 4
298
  ) -> Tuple[str, str]:
299
  """Generate accurate SRT with parallel processing option"""
300
  processor = TextProcessor(words_per_line, lines_per_segment)
301
  segments = processor.split_into_segments(text)
302
-
303
  total_segments = len(segments)
304
- processed_segments = []
305
 
306
- # Update progress to show segmentation is complete
307
  if progress_callback:
308
  progress_callback(0.1, "Text segmentation complete")
309
 
 
310
  if parallel and total_segments > 1:
311
- # Process segments in parallel
312
- processed_count = 0
313
- segment_tasks = []
314
-
315
- # Create a semaphore to limit concurrent tasks
316
  semaphore = asyncio.Semaphore(max_workers)
 
317
 
318
  async def process_with_semaphore(segment):
319
  async with semaphore:
320
  nonlocal processed_count
321
- try:
322
- result = await process_segment_with_timing(segment, voice, rate, pitch)
323
- processed_count += 1
324
- if progress_callback:
325
- progress = 0.1 + (0.8 * processed_count / total_segments)
326
- progress_callback(progress, f"Processed {processed_count}/{total_segments} segments")
327
- return result
328
- except Exception as e:
329
- # Handle errors in individual segments
330
- processed_count += 1
331
- if progress_callback:
332
- progress = 0.1 + (0.8 * processed_count / total_segments)
333
- progress_callback(progress, f"Error in segment {segment.id}: {str(e)}")
334
- raise
335
 
336
- # Create tasks for all segments
337
- for segment in segments:
338
- segment_tasks.append(process_with_semaphore(segment))
339
 
340
- # Run all tasks and collect results
341
- try:
342
- processed_segments = await asyncio.gather(*segment_tasks)
343
- except Exception as e:
344
- if progress_callback:
345
- progress_callback(0.9, f"Error during parallel processing: {str(e)}")
346
- raise TTSError(f"Failed during parallel processing: {str(e)}")
347
  else:
348
- # Process segments sequentially (original method)
349
  for i, segment in enumerate(segments):
350
- try:
351
- processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
352
- processed_segments.append(processed_segment)
353
-
354
- if progress_callback:
355
- progress = 0.1 + (0.8 * (i + 1) / total_segments)
356
- progress_callback(progress, f"Processed {i + 1}/{total_segments} segments")
357
- except Exception as e:
358
- if progress_callback:
359
- progress_callback(0.9, f"Error processing segment {segment.id}: {str(e)}")
360
- raise TTSError(f"Failed to process segment {segment.id}: {str(e)}")
361
 
362
- # Sort segments by ID to ensure correct order
363
  processed_segments.sort(key=lambda s: s.id)
364
-
365
  if progress_callback:
366
  progress_callback(0.9, "Finalizing audio and subtitles")
367
 
368
- # Now combine the segments in the correct order
369
  current_time = 0
370
  final_audio = AudioSegment.empty()
371
  srt_content = ""
372
-
373
  for segment in processed_segments:
374
- # Calculate precise timing
375
  segment.start_time = current_time
376
  segment.end_time = current_time + segment.duration
377
-
378
- # Add to SRT with precise timing
379
- srt_content += (
380
- f"{segment.id}\n"
381
- f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
382
- f"{segment.text}\n\n"
383
- )
384
-
385
- # Add to final audio with precise positioning
386
  final_audio = final_audio.append(segment.audio, crossfade=0)
387
-
388
- # Update timing with precise gap
389
  current_time = segment.end_time
390
 
391
- # Export with high precision
392
  srt_path, audio_path = file_manager.create_output_paths()
393
-
394
  try:
395
- # Export with optimized quality settings and compression
396
- export_params = {
397
- 'format': 'mp3',
398
- 'bitrate': '192k', # Reduced from 320k but still high quality
399
- 'parameters': [
400
- '-ar', '44100', # Standard sample rate
401
- '-ac', '2', # Stereo
402
- '-compression_level', '0', # Best compression
403
- '-qscale:a', '2' # High quality VBR encoding
404
- ]
405
- }
406
  final_audio.export(audio_path, **export_params)
407
-
408
- with open(srt_path, "w", encoding='utf-8') as f:
409
- f.write(srt_content)
410
  except Exception as e:
411
- if progress_callback:
412
- progress_callback(1.0, f"Error exporting final files: {str(e)}")
413
  raise TTSError(f"Failed to export final files: {str(e)}")
414
 
415
  if progress_callback:
416
  progress_callback(1.0, "Complete!")
417
-
418
  return srt_path, audio_path
419
 
420
- # IMPROVEMENT 4: Progress Reporting with proper error handling for older Gradio versions
421
  async def process_text_with_progress(
422
- text,
423
- pitch,
424
- rate,
425
- voice,
426
- words_per_line,
427
- lines_per_segment,
428
- parallel_processing,
429
  progress=gr.Progress()
430
  ):
431
- # Input validation
 
 
 
432
  if not text or text.strip() == "":
433
- return None, None, None, gr.update(value="", visible=True), gr.update(value="", visible=False), "Please enter some text to convert to speech."
434
 
435
- # Format pitch and rate strings
436
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
437
  rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
438
 
439
  try:
440
- # Start progress tracking
441
  progress(0, "Preparing text...")
442
 
443
  def update_progress(value, status):
444
  progress(value, status)
445
 
446
  srt_path, audio_path = await generate_accurate_srt(
447
- text,
448
- voice_options[voice],
449
- rate_str,
450
- pitch_str,
451
- words_per_line,
452
- lines_per_segment,
453
  progress_callback=update_progress,
454
  parallel=parallel_processing
455
  )
456
 
457
- # Generate Markdown links for download that open in a new tab
458
- srt_download_link = f'<a href="file={srt_path}" download="subtitles.srt" target="_blank">Download SRT</a>'
459
- audio_download_link = f'<a href="file={audio_path}" download="audio.mp3" target="_blank">Download Audio</a>'
460
-
461
- # Return the paths for gr.Audio and Markdown for download links
462
- return (
463
- audio_path,
464
- gr.update(value=srt_download_link, visible=True), # Use gr.Markdown for SRT download
465
- gr.update(value=audio_download_link, visible=True), # Use gr.Markdown for Audio download
466
- gr.update(value="", visible=False), # Hide error message
467
- "" # Clear error message
468
- )
469
  except TTSError as e:
470
- # Return specific TTS error
471
- return None, gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(value=f"TTS Error: {str(e)}", visible=True), f"TTS Error: {str(e)}"
472
  except Exception as e:
473
  # Return any other error
474
- return None, gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(value=f"Unexpected error: {str(e)}", visible=True), f"Unexpected error: {str(e)}"
475
-
476
 
477
  # Voice options dictionary
478
  voice_options = {
479
- "Andrew Male": "en-US-AndrewNeural",
480
- "Jenny Female": "en-US-JennyNeural",
481
- "Guy Male": "en-US-GuyNeural",
482
- "Ana Female": "en-US-AnaNeural",
483
- "Aria Female": "en-US-AriaNeural",
484
- "Brian Male": "en-US-BrianNeural",
485
- "Christopher Male": "en-US-ChristopherNeural",
486
- "Eric Male": "en-US-EricNeural",
487
- "Michelle Male": "en-US-MichelleNeural",
488
- "Roger Male": "en-US-RogerNeural",
489
- "Natasha Female": "en-AU-NatashaNeural",
490
- "William Male": "en-AU-WilliamNeural",
491
- "Clara Female": "en-CA-ClaraNeural",
492
- "Liam Female ": "en-CA-LiamNeural",
493
- "Libby Female": "en-GB-LibbyNeural",
494
- "Maisie": "en-GB-MaisieNeural",
495
- "Ryan": "en-GB-RyanNeural",
496
- "Sonia": "en-GB-SoniaNeural",
497
- "Thomas": "en-GB-ThomasNeural",
498
- "Sam": "en-HK-SamNeural",
499
- "Yan": "en-HK-YanNeural",
500
- "Connor": "en-IE-ConnorNeural",
501
- "Emily": "en-IE-EmilyNeural",
502
- "Neerja": "en-IN-NeerjaNeural",
503
- "Prabhat": "en-IN-PrabhatNeural",
504
- "Asilia": "en-KE-AsiliaNeural",
505
- "Chilemba": "en-KE-ChilembaNeural",
506
- "Abeo": "en-NG-AbeoNeural",
507
- "Ezinne": "en-NG-EzinneNeural",
508
- "Mitchell": "en-NZ-MitchellNeural",
509
- "James": "en-PH-JamesNeural",
510
- "Rosa": "en-PH-RosaNeural",
511
- "Luna": "en-SG-LunaNeural",
512
- "Wayne": "en-SG-WayneNeural",
513
- "Elimu": "en-TZ-ElimuNeural",
514
- "Imani": "en-TZ-ImaniNeural",
515
- "Leah": "en-ZA-LeahNeural",
516
- "Luke": "en-ZA-LukeNeural"
517
- # Add other voices as needed
518
  }
519
 
520
- # Register cleanup on exit
521
  import atexit
522
  atexit.register(file_manager.cleanup_all)
523
 
524
- # Create Gradio interface
525
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
526
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
527
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
@@ -529,85 +411,41 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
529
  with gr.Row():
530
  with gr.Column(scale=3):
531
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
532
-
533
  with gr.Column(scale=2):
534
- voice_dropdown = gr.Dropdown(
535
- label="Select Voice",
536
- choices=list(voice_options.keys()),
537
- value="Jenny Female"
538
- )
539
- pitch_slider = gr.Slider(
540
- label="Pitch Adjustment (Hz)",
541
- minimum=-10,
542
- maximum=10,
543
- value=0,
544
- step=1
545
- )
546
- rate_slider = gr.Slider(
547
- label="Rate Adjustment (%)",
548
- minimum=-25,
549
- maximum=25,
550
- value=0,
551
- step=1
552
- )
553
 
554
  with gr.Row():
555
  with gr.Column():
556
- words_per_line = gr.Slider(
557
- label="Words per Line",
558
- minimum=3,
559
- maximum=12,
560
- value=6,
561
- step=1,
562
- info="Controls how many words appear on each line of the subtitle"
563
- )
564
  with gr.Column():
565
- lines_per_segment = gr.Slider(
566
- label="Lines per Segment",
567
- minimum=1,
568
- maximum=4,
569
- value=2,
570
- step=1,
571
- info="Controls how many lines appear in each subtitle segment"
572
- )
573
  with gr.Column():
574
- parallel_processing = gr.Checkbox(
575
- label="Enable Parallel Processing",
576
- value=True,
577
- info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
578
- )
579
-
580
- submit_btn = gr.Button("Generate Audio & Subtitles")
581
 
582
- # Add error message component
583
- error_output = gr.Textbox(label="Status", visible=False)
584
 
 
585
  with gr.Row():
586
- with gr.Column():
587
- audio_output = gr.Audio(label="Preview Audio")
588
- with gr.Column():
589
- # Change gr.File to gr.Markdown for download links
590
- srt_download_link = gr.Markdown(value="", visible=False, label="Download SRT")
591
- audio_download_link = gr.Markdown(value="", visible=False, label="Download Audio")
592
 
593
- # Handle button click with manual error handling instead of .catch()
594
  submit_btn.click(
595
  fn=process_text_with_progress,
596
  inputs=[
597
- text_input,
598
- pitch_slider,
599
- rate_slider,
600
- voice_dropdown,
601
- words_per_line,
602
- lines_per_segment,
603
- parallel_processing
604
  ],
605
  outputs=[
606
- audio_output,
607
- srt_download_link, # Output to Markdown component
608
- audio_download_link, # Output to Markdown component
609
- error_output,
610
- error_output
611
  ],
612
  api_name="generate"
613
  )
 
72
 
73
  # Adjust for punctuation density
74
  punct_count = sum(text.count(p) for p in self.punctuation_weights.keys())
75
+ if len(words) > 0:
76
+ complexity *= (1 + (punct_count / len(words)) * 0.5)
77
 
78
  return complexity
79
 
 
112
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
113
  text = re.sub(r'\s+([.!?,;:])', r'\1', text)
114
 
 
115
  segments = []
 
 
116
  words = text.split()
117
 
118
  i = 0
119
  while i < len(words):
120
+ # Dynamically select a chunk to analyze for breaks
121
+ chunk_end = i + int(self.max_segment_words)
122
+ chunk_text = ' '.join(words[i:chunk_end])
123
+ complexity = self.analyze_sentence_complexity(chunk_text)
124
+ breaks = self.find_natural_breaks(chunk_text)
125
 
126
+ best_break = -1
127
+ best_weight = -1
128
+
129
+ # Find the best break point within the ideal segment length
130
+ ideal_length = self.words_per_line * self.lines_per_segment
131
 
132
  for break_idx, weight in breaks:
133
+ # Prioritize breaks closer to the ideal length
134
+ distance_penalty = 1 - (abs(break_idx - ideal_length) / ideal_length) * 0.5
135
+ score = weight * distance_penalty
136
+
137
+ if score > best_weight:
138
+ best_break = break_idx
139
+ best_weight = score
140
 
141
+ if best_break == -1:
142
+ # If no break found, split at the ideal length or end of text
143
+ best_break = min(ideal_length, len(words) - 1 - i)
144
 
145
+ segment_words = words[i : i + best_break + 1]
 
146
  segment_text = ' '.join(segment_words)
147
 
 
148
  lines = self.split_into_lines(segment_text)
149
  final_segment_text = '\n'.join(lines)
150
 
 
168
  current_line.append(word)
169
  word_count += 1
170
 
 
171
  is_break = (
172
  word_count >= self.words_per_line or
173
  any(word.endswith(p) for p in '.!?') or
 
175
  any(word.endswith(p) for p in ',;:'))
176
  )
177
 
178
+ if is_break and len(words) > word_count:
179
  lines.append(' '.join(current_line))
180
  current_line = []
181
  word_count = 0
 
185
 
186
  return lines
187
 
 
188
  class TTSError(Exception):
189
  """Custom exception for TTS processing errors"""
190
  pass
191
 
192
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
193
  """Process a complete segment as a single TTS unit with improved error handling"""
194
+ temp_dir = tempfile.gettempdir()
195
+ audio_file = os.path.join(temp_dir, f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
196
  try:
 
197
  segment_text = ' '.join(segment.text.split('\n'))
198
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
199
 
 
207
 
208
  try:
209
  segment.audio = AudioSegment.from_file(audio_file)
 
210
  silence = AudioSegment.silent(duration=30)
211
  segment.audio = silence + segment.audio + silence
212
  segment.duration = len(segment.audio)
 
223
  try:
224
  os.remove(audio_file)
225
  except Exception:
226
+ pass
227
 
 
228
  class FileManager:
229
  """Manages temporary and output files with cleanup capabilities"""
230
  def __init__(self):
231
  self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
232
  self.output_files = []
233
+ self.max_files_to_keep = 5
234
 
 
 
 
 
235
  def create_output_paths(self):
236
  """Create paths for output files"""
237
  unique_id = str(uuid.uuid4())
 
246
  def cleanup_old_files(self):
247
  """Clean up old output files, keeping only the most recent ones"""
248
  if len(self.output_files) > self.max_files_to_keep:
249
+ old_files_to_remove = self.output_files[:-self.max_files_to_keep]
250
+ for srt_path, audio_path in old_files_to_remove:
251
  try:
252
+ if os.path.exists(srt_path): os.remove(srt_path)
253
+ if os.path.exists(audio_path): os.remove(audio_path)
 
 
254
  except Exception:
255
+ pass
 
 
256
  self.output_files = self.output_files[-self.max_files_to_keep:]
257
 
258
  def cleanup_all(self):
259
  """Clean up all managed files"""
260
  for srt_path, audio_path in self.output_files:
261
  try:
262
+ if os.path.exists(srt_path): os.remove(srt_path)
263
+ if os.path.exists(audio_path): os.remove(audio_path)
 
 
264
  except Exception:
265
+ pass
 
266
  try:
267
+ if os.path.exists(self.temp_dir): os.rmdir(self.temp_dir)
268
  except Exception:
269
+ pass
270
 
 
271
  file_manager = FileManager()
272
 
 
273
  async def generate_accurate_srt(
274
+ text: str, voice: str, rate: str, pitch: str,
275
+ words_per_line: int, lines_per_segment: int,
276
+ progress_callback=None, parallel: bool = True, max_workers: int = 4
 
 
 
 
 
 
277
  ) -> Tuple[str, str]:
278
  """Generate accurate SRT with parallel processing option"""
279
  processor = TextProcessor(words_per_line, lines_per_segment)
280
  segments = processor.split_into_segments(text)
 
281
  total_segments = len(segments)
 
282
 
 
283
  if progress_callback:
284
  progress_callback(0.1, "Text segmentation complete")
285
 
286
+ processed_segments = []
287
  if parallel and total_segments > 1:
 
 
 
 
 
288
  semaphore = asyncio.Semaphore(max_workers)
289
+ processed_count = 0
290
 
291
  async def process_with_semaphore(segment):
292
  async with semaphore:
293
  nonlocal processed_count
294
+ result = await process_segment_with_timing(segment, voice, rate, pitch)
295
+ processed_count += 1
296
+ if progress_callback:
297
+ progress = 0.1 + (0.8 * processed_count / total_segments)
298
+ progress_callback(progress, f"Processed {processed_count}/{total_segments} segments")
299
+ return result
 
 
 
 
 
 
 
 
300
 
301
+ tasks = [process_with_semaphore(s) for s in segments]
302
+ results = await asyncio.gather(*tasks, return_exceptions=True)
 
303
 
304
+ for res in results:
305
+ if isinstance(res, Exception):
306
+ raise TTSError(f"A task failed during parallel processing: {res}")
307
+ processed_segments.append(res)
 
 
 
308
  else:
 
309
  for i, segment in enumerate(segments):
310
+ processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
311
+ processed_segments.append(processed_segment)
312
+ if progress_callback:
313
+ progress = 0.1 + (0.8 * (i + 1) / total_segments)
314
+ progress_callback(progress, f"Processed {i + 1}/{total_segments} segments")
 
 
 
 
 
 
315
 
 
316
  processed_segments.sort(key=lambda s: s.id)
 
317
  if progress_callback:
318
  progress_callback(0.9, "Finalizing audio and subtitles")
319
 
 
320
  current_time = 0
321
  final_audio = AudioSegment.empty()
322
  srt_content = ""
 
323
  for segment in processed_segments:
 
324
  segment.start_time = current_time
325
  segment.end_time = current_time + segment.duration
326
+ srt_content += f"{segment.id}\n{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n{segment.text}\n\n"
 
 
 
 
 
 
 
 
327
  final_audio = final_audio.append(segment.audio, crossfade=0)
 
 
328
  current_time = segment.end_time
329
 
 
330
  srt_path, audio_path = file_manager.create_output_paths()
 
331
  try:
332
+ export_params = {'format': 'mp3', 'bitrate': '192k', 'parameters': ['-ar', '44100', '-ac', '2', '-qscale:a', '2']}
 
 
 
 
 
 
 
 
 
 
333
  final_audio.export(audio_path, **export_params)
334
+ with open(srt_path, "w", encoding='utf-8') as f: f.write(srt_content)
 
 
335
  except Exception as e:
 
 
336
  raise TTSError(f"Failed to export final files: {str(e)}")
337
 
338
  if progress_callback:
339
  progress_callback(1.0, "Complete!")
 
340
  return srt_path, audio_path
341
 
 
342
  async def process_text_with_progress(
343
+ text, pitch, rate, voice, words_per_line,
344
+ lines_per_segment, parallel_processing,
 
 
 
 
 
345
  progress=gr.Progress()
346
  ):
347
+ """
348
+ Processes the text, generates audio and SRT, and returns paths and HTML links.
349
+ The returned links are configured to open in a new browser tab.
350
+ """
351
  if not text or text.strip() == "":
352
+ return None, "", True, "Please enter some text to convert to speech."
353
 
 
354
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
355
  rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
356
 
357
  try:
 
358
  progress(0, "Preparing text...")
359
 
360
  def update_progress(value, status):
361
  progress(value, status)
362
 
363
  srt_path, audio_path = await generate_accurate_srt(
364
+ text, voice_options[voice], rate_str, pitch_str,
365
+ words_per_line, lines_per_segment,
 
 
 
 
366
  progress_callback=update_progress,
367
  parallel=parallel_processing
368
  )
369
 
370
+ # MODIFICATION: Create HTML for download links that open in a new tab
371
+ download_html = f"""
372
+ <div style="text-align: center; padding-top: 10px;">
373
+ <a href="/file={srt_path}" target="_blank" download="subtitles.srt" style="font-weight: 600; color: #0b5ed7; text-decoration: none; margin-right: 20px;">📥 Download SRT File</a>
374
+ <a href="/file={audio_path}" target="_blank" download="audio.mp3" style="font-weight: 600; color: #0b5ed7; text-decoration: none;">📥 Download Audio File</a>
375
+ </div>
376
+ """
377
+
378
+ # MODIFICATION: Return audio preview path, HTML links, and hide error
379
+ return audio_path, download_html, False, ""
 
 
380
  except TTSError as e:
381
+ # Return specific TTS error, clearing the audio preview and download links
382
+ return None, "", True, f"TTS Error: {str(e)}"
383
  except Exception as e:
384
  # Return any other error
385
+ return None, "", True, f"Unexpected error: {str(e)}"
 
386
 
387
  # Voice options dictionary
388
  voice_options = {
389
+ "Andrew Male": "en-US-AndrewNeural", "Jenny Female": "en-US-JennyNeural", "Guy Male": "en-US-GuyNeural",
390
+ "Ana Female": "en-US-AnaNeural", "Aria Female": "en-US-AriaNeural", "Brian Male": "en-US-BrianNeural",
391
+ "Christopher Male": "en-US-ChristopherNeural", "Eric Male": "en-US-EricNeural", "Michelle Male": "en-US-MichelleNeural",
392
+ "Roger Male": "en-US-RogerNeural", "Natasha Female": "en-AU-NatashaNeural", "William Male": "en-AU-WilliamNeural",
393
+ "Clara Female": "en-CA-ClaraNeural", "Liam Female ": "en-CA-LiamNeural", "Libby Female": "en-GB-LibbyNeural",
394
+ "Maisie": "en-GB-MaisieNeural", "Ryan": "en-GB-RyanNeural", "Sonia": "en-GB-SoniaNeural",
395
+ "Thomas": "en-GB-ThomasNeural", "Sam": "en-HK-SamNeural", "Yan": "en-HK-YanNeural",
396
+ "Connor": "en-IE-ConnorNeural", "Emily": "en-IE-EmilyNeural", "Neerja": "en-IN-NeerjaNeural",
397
+ "Prabhat": "en-IN-PrabhatNeural", "Asilia": "en-KE-AsiliaNeural", "Chilemba": "en-KE-ChilembaNeural",
398
+ "Abeo": "en-NG-AbeoNeural", "Ezinne": "en-NG-EzinneNeural", "Mitchell": "en-NZ-MitchellNeural",
399
+ "James": "en-PH-JamesNeural", "Rosa": "en-PH-RosaNeural", "Luna": "en-SG-LunaNeural",
400
+ "Wayne": "en-SG-WayneNeural", "Elimu": "en-TZ-ElimuNeural", "Imani": "en-TZ-ImaniNeural",
401
+ "Leah": "en-ZA-LeahNeural", "Luke": "en-ZA-LukeNeural"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  }
403
 
 
404
  import atexit
405
  atexit.register(file_manager.cleanup_all)
406
 
 
407
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
408
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
409
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
 
411
  with gr.Row():
412
  with gr.Column(scale=3):
413
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
 
414
  with gr.Column(scale=2):
415
+ voice_dropdown = gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female")
416
+ pitch_slider = gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1)
417
+ rate_slider = gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
  with gr.Row():
420
  with gr.Column():
421
+ words_per_line = gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1, info="Words per subtitle line.")
 
 
 
 
 
 
 
422
  with gr.Column():
423
+ lines_per_segment = gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1, info="Lines per subtitle block.")
 
 
 
 
 
 
 
424
  with gr.Column():
425
+ parallel_processing = gr.Checkbox(label="Enable Parallel Processing", value=True, info="Faster conversion for longer texts.")
 
 
 
 
 
 
426
 
427
+ submit_btn = gr.Button("Generate Audio & Subtitles", variant="primary")
428
+ error_output = gr.Textbox(label="Status", visible=False, interactive=False)
429
 
430
+ # MODIFICATION: Changed the output area
431
  with gr.Row():
432
+ with gr.Column(scale=2):
433
+ audio_preview = gr.Audio(label="Preview Audio")
434
+ with gr.Column(scale=1):
435
+ download_links_output = gr.HTML(label="Download Files")
 
 
436
 
437
+ # MODIFICATION: Updated the .click() event outputs
438
  submit_btn.click(
439
  fn=process_text_with_progress,
440
  inputs=[
441
+ text_input, pitch_slider, rate_slider, voice_dropdown,
442
+ words_per_line, lines_per_segment, parallel_processing
 
 
 
 
 
443
  ],
444
  outputs=[
445
+ audio_preview, # Output for the audio player
446
+ download_links_output, # Output for the HTML download links
447
+ error_output, # First update to error_output (visibility)
448
+ error_output # Second update to error_output (value)
 
449
  ],
450
  api_name="generate"
451
  )