hivecorp commited on
Commit
071143c
·
verified ·
1 Parent(s): f2939e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -562
app.py CHANGED
@@ -11,6 +11,7 @@ from concurrent.futures import ThreadPoolExecutor
11
  from typing import List, Tuple, Optional, Dict, Any
12
  import math
13
  from dataclasses import dataclass
 
14
 
15
  class TimingManager:
16
  def __init__(self):
@@ -41,80 +42,59 @@ class Segment:
41
  end_time: int = 0
42
  duration: int = 0
43
  audio: Optional[AudioSegment] = None
44
- lines: List[str] = None # Add lines field for display purposes only
45
 
46
  class TextProcessor:
47
  def __init__(self, words_per_line: int, lines_per_segment: int):
48
  self.words_per_line = words_per_line
49
  self.lines_per_segment = lines_per_segment
50
  self.min_segment_words = 3
51
- self.max_segment_words = words_per_line * lines_per_segment * 1.5 # Allow 50% more for natural breaks
52
  self.punctuation_weights = {
53
- '.': 1.0, # Strong break
54
  '!': 1.0,
55
  '?': 1.0,
56
- ';': 0.8, # Medium-strong break
57
  ':': 0.7,
58
- ',': 0.5, # Medium break
59
- '-': 0.3, # Weak break
60
  '(': 0.2,
61
  ')': 0.2
62
  }
63
 
64
  def analyze_sentence_complexity(self, text: str) -> float:
65
- """Analyze sentence complexity to determine optimal segment length"""
66
  words = text.split()
67
  complexity = 1.0
68
-
69
- # Adjust for sentence length
70
  if len(words) > self.words_per_line * 2:
71
  complexity *= 1.2
72
-
73
- # Adjust for punctuation density
74
  punct_count = sum(text.count(p) for p in self.punctuation_weights.keys())
75
  complexity *= (1 + (punct_count / len(words)) * 0.5)
76
-
77
  return complexity
78
 
79
  def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
80
- """Find natural break points with their weights"""
81
  breaks = []
82
  words = text.split()
83
-
84
  for i, word in enumerate(words):
85
  weight = 0
86
-
87
- # Check for punctuation
88
  for punct, punct_weight in self.punctuation_weights.items():
89
  if word.endswith(punct):
90
  weight = max(weight, punct_weight)
91
-
92
- # Check for natural phrase boundaries
93
  phrase_starters = {'however', 'therefore', 'moreover', 'furthermore', 'meanwhile', 'although', 'because'}
94
  if i < len(words) - 1 and words[i+1].lower() in phrase_starters:
95
  weight = max(weight, 0.6)
96
-
97
- # Check for conjunctions at natural points
98
  if i > self.min_segment_words:
99
  conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}
100
  if word.lower() in conjunctions:
101
  weight = max(weight, 0.4)
102
-
103
  if weight > 0:
104
  breaks.append((i, weight))
105
-
106
  return breaks
107
 
108
  def split_into_segments(self, text: str) -> List[Segment]:
109
- # Normalize text and add proper spacing around punctuation
110
  text = re.sub(r'\s+', ' ', text.strip())
111
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
112
  text = re.sub(r'\s+([.!?,;:])', r'\1', text)
113
-
114
- # First, split into major segments by strong punctuation
115
  segments = []
116
- current_segment = []
117
- current_text = ""
118
  words = text.split()
119
 
120
  i = 0
@@ -122,7 +102,6 @@ class TextProcessor:
122
  complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
123
  breaks = self.find_natural_breaks(' '.join(words[i:i + int(self.max_segment_words * complexity)]))
124
 
125
- # Find best break point
126
  best_break = None
127
  best_weight = 0
128
 
@@ -135,14 +114,10 @@ class TextProcessor:
135
  best_weight = weight
136
 
137
  if best_break is None:
138
- # If no good break found, use maximum length
139
  best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)
140
 
141
- # Create segment
142
  segment_words = words[i:i + best_break + 1]
143
  segment_text = ' '.join(segment_words)
144
-
145
- # Split segment into lines
146
  lines = self.split_into_lines(segment_text)
147
  final_segment_text = '\n'.join(lines)
148
 
@@ -152,11 +127,9 @@ class TextProcessor:
152
  ))
153
 
154
  i += best_break + 1
155
-
156
  return segments
157
 
158
  def split_into_lines(self, text: str) -> List[str]:
159
- """Split segment text into natural lines"""
160
  words = text.split()
161
  lines = []
162
  current_line = []
@@ -166,7 +139,6 @@ class TextProcessor:
166
  current_line.append(word)
167
  word_count += 1
168
 
169
- # Check for natural line breaks
170
  is_break = (
171
  word_count >= self.words_per_line or
172
  any(word.endswith(p) for p in '.!?') or
@@ -181,7 +153,6 @@ class TextProcessor:
181
 
182
  if current_line:
183
  lines.append(' '.join(current_line))
184
-
185
  return lines
186
 
187
  class TTSError(Exception):
@@ -189,10 +160,8 @@ class TTSError(Exception):
189
  pass
190
 
191
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
192
- """Process a complete segment as a single TTS unit with improved error handling"""
193
  audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
194
  try:
195
- # Process the entire segment text as one unit, replacing newlines with spaces
196
  segment_text = ' '.join(segment.text.split('\n'))
197
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
198
 
@@ -206,7 +175,6 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
206
 
207
  try:
208
  segment.audio = AudioSegment.from_file(audio_file)
209
- # Reduced silence to 30ms for more natural flow
210
  silence = AudioSegment.silent(duration=30)
211
  segment.audio = silence + segment.audio + silence
212
  segment.duration = len(segment.audio)
@@ -223,21 +191,19 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
223
  try:
224
  os.remove(audio_file)
225
  except Exception:
226
- pass # Ignore deletion errors
227
 
228
  class FileManager:
229
  """Manages temporary and output files with cleanup capabilities"""
230
  def __init__(self):
231
  self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
232
  self.output_files = []
233
- self.max_files_to_keep = 5 # Keep only the 5 most recent output pairs
234
 
235
  def get_temp_path(self, prefix):
236
- """Get a path for a temporary file"""
237
  return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")
238
 
239
  def create_output_paths(self):
240
- """Create paths for output files"""
241
  unique_id = str(uuid.uuid4())
242
  audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
243
  srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")
@@ -248,7 +214,6 @@ class FileManager:
248
  return srt_path, audio_path
249
 
250
  def cleanup_old_files(self):
251
- """Clean up old output files, keeping only the most recent ones"""
252
  if len(self.output_files) > self.max_files_to_keep:
253
  old_files = self.output_files[:-self.max_files_to_keep]
254
  for srt_path, audio_path in old_files:
@@ -258,13 +223,10 @@ class FileManager:
258
  if os.path.exists(audio_path):
259
  os.remove(audio_path)
260
  except Exception:
261
- pass # Ignore deletion errors
262
-
263
- # Update the list to only include files we're keeping
264
  self.output_files = self.output_files[-self.max_files_to_keep:]
265
 
266
  def cleanup_all(self):
267
- """Clean up all managed files"""
268
  for srt_path, audio_path in self.output_files:
269
  try:
270
  if os.path.exists(srt_path):
@@ -272,12 +234,11 @@ class FileManager:
272
  if os.path.exists(audio_path):
273
  os.remove(audio_path)
274
  except Exception:
275
- pass # Ignore deletion errors
276
-
277
  try:
278
  os.rmdir(self.temp_dir)
279
  except Exception:
280
- pass # Ignore if directory isn't empty or can't be removed
281
 
282
  file_manager = FileManager()
283
 
@@ -292,7 +253,6 @@ async def generate_accurate_srt(
292
  parallel: bool = True,
293
  max_workers: int = 4
294
  ) -> Tuple[str, str]:
295
- """Generate accurate SRT with parallel processing option"""
296
  processor = TextProcessor(words_per_line, lines_per_segment)
297
  segments = processor.split_into_segments(text)
298
 
@@ -396,31 +356,32 @@ async def generate_accurate_srt(
396
 
397
  return srt_path, audio_path
398
 
399
- # This function is now correctly aligned to return types expected by the UI
400
  async def process_text_with_progress(
401
  text,
402
  pitch,
403
  rate,
404
- voice,
405
  words_per_line,
406
  lines_per_segment,
407
  parallel_processing,
408
  progress=gr.Progress()
409
  ):
410
- # Initialize outputs to their default 'hidden' state by providing empty strings
411
- # and setting visible=False via gr.update.
412
- output_audio = None # gr.Audio expects None or a path
413
- output_srt_link_html = gr.update(value="", visible=False) # gr.HTML expects a string
414
- output_audio_link_html = gr.update(value="", visible=False) # gr.HTML expects a string
415
- output_error_message = gr.update(value="", visible=False) # gr.Textbox expects a string
416
-
417
  # Input validation
418
  if not text or text.strip() == "":
 
419
  return (
420
- output_audio,
421
- output_srt_link_html,
422
- output_audio_link_html,
423
- gr.update(value="Please enter some text to convert to speech.", visible=True)
424
  )
425
 
426
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
@@ -432,9 +393,10 @@ async def process_text_with_progress(
432
  def update_progress(value, status):
433
  progress(value, status)
434
 
 
435
  srt_path, audio_path = await generate_accurate_srt(
436
  text,
437
- voice_options[voice],
438
  rate_str,
439
  pitch_str,
440
  words_per_line,
@@ -443,8 +405,9 @@ async def process_text_with_progress(
443
  parallel=parallel_processing
444
  )
445
 
446
- # Create HTML strings for download links. Gradio serves files using "file=" prefix.
447
- srt_download_html = f"""
 
448
  <a href="file={srt_path}" download="subtitles.srt" target="_blank"
449
  style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
450
  onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
@@ -452,7 +415,7 @@ async def process_text_with_progress(
452
  Download SRT File
453
  </a>
454
  """
455
- audio_download_html = f"""
456
  <a href="file={audio_path}" download="audio.mp3" target="_blank"
457
  style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
458
  onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
@@ -460,89 +423,48 @@ async def process_text_with_progress(
460
  Download Audio File
461
  </a>
462
  """
 
 
 
463
 
 
464
  return (
465
- audio_path, # Path for gr.Audio preview
466
- gr.update(value=srt_download_html, visible=True), # HTML link for SRT download
467
- gr.update(value=audio_download_html, visible=True), # HTML link for Audio download
468
- gr.update(value="", visible=False) # Hide error message
469
  )
470
  except TTSError as e:
471
- error_message = f"TTS Error: {str(e)}"
472
  except Exception as e:
473
- error_message = f"Unexpected error: {str(e)}"
474
 
 
475
  return (
476
- None, # Clear audio output on error
477
- gr.update(value="", visible=False), # Hide SRT download link
478
- gr.update(value="", visible=False), # Hide Audio download link
479
- gr.update(value=error_message, visible=True) # Show error message
480
  )
481
 
482
- # This function is not used in the final version of the code, but kept for context from your example.
483
- def create_download_link(audio_path):
484
- if audio_path is None:
485
- return None
486
-
487
- filename = Path(audio_path).name
488
- # Gradio handles file serving with "file=" prefix directly, no need for base_url
489
- file_url = f"file={audio_path}"
490
-
491
- return f"""
492
- <a href="{file_url}"
493
- download="{filename}"
494
- target="_blank"
495
- rel="noopener noreferrer"
496
- style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
497
- onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
498
- onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='none';"
499
- onclick="event.preventDefault(); fetch(this.href).then(resp => resp.blob()).then(blob => {{
500
- const url = window.URL.createObjectURL(blob);
501
- const a = document.createElement('a');
502
- a.style.display = 'none';
503
- a.href = url;
504
- a.download = '{filename}';
505
- document.body.appendChild(a);
506
- a.click();
507
- window.URL.revokeObjectURL(url);
508
- document.body.removeChild(a);
509
- }});">
510
- Download Audio File
511
- </a>
512
- """
513
-
514
- def cleanup_file(filepath, delay=300):
515
- def delete_file():
516
- try:
517
- if os.path.exists(filepath):
518
- os.remove(filepath)
519
- print(f"Cleaned up file: {filepath}")
520
- except Exception as e:
521
- print(f"Error cleaning up file {filepath}: {e}")
522
-
523
- Timer(delay, delete_file).start()
524
-
525
- # --- Voice Options and Gradio Interface ---
526
- language_dict = {
527
- "Hindi": {
528
- "Madhur": "hi-IN-MadhurNeural",
529
- "Swara": "hi-IN-SwaraNeural"
530
- },
531
- "English": {
532
- "Jenny": "en-US-JennyNeural",
533
- "Guy": "en-US-GuyNeural",
534
- "Ana": "en-US-AnaNeural",
535
- "Aria": "en-US-AriaNeural",
536
- "Brian": "en-US-BrianNeural",
537
- "Christopher": "en-US-ChristopherNeural",
538
- "Eric": "en-US-EricNeural",
539
- "Michelle": "en-US-MichelleNeural",
540
- "Roger": "en-US-RogerNeural",
541
- "Natasha": "en-AU-NatashaNeural",
542
- "William": "en-AU-WilliamNeural",
543
- "Clara": "en-CA-ClaraNeural",
544
- "Liam": "en-CA-LiamNeural",
545
- "Libby": "en-GB-LibbyNeural",
546
  "Maisie": "en-GB-MaisieNeural",
547
  "Ryan": "en-GB-RyanNeural",
548
  "Sonia": "en-GB-SoniaNeural",
@@ -565,414 +487,56 @@ language_dict = {
565
  "Elimu": "en-TZ-ElimuNeural",
566
  "Imani": "en-TZ-ImaniNeural",
567
  "Leah": "en-ZA-LeahNeural",
568
- "Luke": "en-ZA-LukeNeural"
569
- },
570
- "Spanish": {
571
- "Elena": "es-AR-ElenaNeural",
572
  "Tomas": "es-AR-TomasNeural",
573
- "Marcelo": "es-BO-MarceloNeural",
574
- "Sofia": "es-BO-SofiaNeural",
575
- "Gonzalo": "es-CO-GonzaloNeural",
576
- "Salome": "es-CO-SalomeNeural",
577
- "Juan": "es-CR-JuanNeural",
578
- "Maria": "es-CR-MariaNeural",
579
- "Belkys": "es-CU-BelkysNeural",
580
- "Emilio": "es-DO-EmilioNeural",
581
- "Ramona": "es-DO-RamonaNeural",
582
- "Andrea": "es-EC-AndreaNeural",
583
- "Luis": "es-EC-LuisNeural",
584
- "Alvaro": "es-ES-AlvaroNeural",
585
- "Elvira": "es-ES-ElviraNeural",
586
- "Teresa": "es-GQ-TeresaNeural",
587
- "Andres": "es-GT-AndresNeural",
588
- "Marta": "es-GT-MartaNeural",
589
- "Carlos": "es-HN-CarlosNeural",
590
- "Karla": "es-HN-KarlaNeural",
591
- "Federico": "es-NI-FedericoNeural",
592
- "Yolanda": "es-NI-YolandaNeural",
593
- "Margarita": "es-PA-MargaritaNeural",
594
- "Roberto": "es-PA-RobertoNeural",
595
- "Alex": "es-PE-AlexNeural",
596
- "Camila": "es-PE-CamilaNeural",
597
- "Karina": "es-PR-KarinaNeural",
598
- "Victor": "es-PR-VictorNeural",
599
- "Mario": "es-PY-MarioNeural",
600
- "Tania": "es-PY-TaniaNeural",
601
- "Lorena": "es-SV-LorenaNeural",
602
- "Rodrigo": "es-SV-RodrigoNeural",
603
- "Alonso": "es-US-AlonsoNeural",
604
- "Paloma": "es-US-PalomaNeural",
605
- "Mateo": "es-UY-MateoNeural",
606
- "Valentina": "es-UY-ValentinaNeural",
607
- "Paola": "es-VE-PaolaNeural",
608
- "Sebastian": "es-VE-SebastianNeural"
609
- },
610
- "Arabic": {
611
- "Hamed": "ar-SA-HamedNeural",
612
- "Zariyah": "ar-SA-ZariyahNeural",
613
- "Fatima": "ar-AE-FatimaNeural",
614
- "Hamdan": "ar-AE-HamdanNeural",
615
- "Ali": "ar-BH-AliNeural",
616
- "Laila": "ar-BH-LailaNeural",
617
- "Ismael": "ar-DZ-IsmaelNeural",
618
- "Salma": "ar-EG-SalmaNeural",
619
- "Shakir": "ar-EG-ShakirNeural",
620
- "Bassel": "ar-IQ-BasselNeural",
621
- "Rana": "ar-IQ-RanaNeural",
622
- "Sana": "ar-JO-SanaNeural",
623
- "Taim": "ar-JO-TaimNeural",
624
- "Fahed": "ar-KW-FahedNeural",
625
- "Noura": "ar-KW-NouraNeural",
626
- "Layla": "ar-LB-LaylaNeural",
627
- "Rami": "ar-LB-RamiNeural",
628
- "Iman": "ar-LY-ImanNeural",
629
- "Omar": "ar-LY-OmarNeural",
630
- "Jamal": "ar-MA-JamalNeural",
631
- "Mouna": "ar-MA-MounaNeural",
632
- "Abdullah": "ar-OM-AbdullahNeural",
633
- "Aysha": "ar-OM-AyshaNeural",
634
- "Amal": "ar-QA-AmalNeural",
635
- "Moaz": "ar-QA-MoazNeural",
636
- "Amany": "ar-SY-AmanyNeural",
637
- "Laith": "ar-SY-LaithNeural",
638
- "Hedi": "ar-TN-HediNeural",
639
- "Reem": "ar-TN-ReemNeural",
640
- "Maryam": "ar-YE-MaryamNeural",
641
- "Saleh": "ar-YE-SalehNeural"
642
- },
643
- "Korean": {
644
- "Sun-Hi": "ko-KR-SunHiNeural",
645
- "InJoon": "ko-KR-InJoonNeural"
646
- },
647
- "Thai": {
648
- "Premwadee": "th-TH-PremwadeeNeural",
649
- "Niwat": "th-TH-NiwatNeural"
650
- },
651
- "Vietnamese": {
652
- "HoaiMy": "vi-VN-HoaiMyNeural",
653
- "NamMinh": "vi-VN-NamMinhNeural"
654
- },
655
- "Japanese": {
656
- "Nanami": "ja-JP-NanamiNeural",
657
- "Keita": "ja-JP-KeitaNeural"
658
- },
659
- "French": {
660
- "Denise": "fr-FR-DeniseNeural",
661
- "Eloise": "fr-FR-EloiseNeural",
662
- "Henri": "fr-FR-HenriNeural",
663
- "Sylvie": "fr-CA-SylvieNeural",
664
- "Antoine": "fr-CA-AntoineNeural",
665
- "Jean": "fr-CA-JeanNeural",
666
- "Ariane": "fr-CH-ArianeNeural",
667
- "Fabrice": "fr-CH-FabriceNeural",
668
- "Charline": "fr-BE-CharlineNeural",
669
- "Gerard": "fr-BE-GerardNeural"
670
- },
671
- "Portuguese": {
672
- "Francisca": "pt-BR-FranciscaNeural",
673
- "Antonio": "pt-BR-AntonioNeural",
674
- "Duarte": "pt-PT-DuarteNeural",
675
- "Raquel": "pt-PT-RaquelNeural"
676
- },
677
- "Indonesian": {
678
- "Ardi": "id-ID-ArdiNeural",
679
- "Gadis": "id-ID-GadisNeural"
680
- },
681
- "Hebrew": {
682
- "Avri": "he-IL-AvriNeural",
683
- "Hila": "he-IL-HilaNeural"
684
- },
685
- "Italian": {
686
- "Isabella": "it-IT-IsabellaNeural",
687
- "Diego": "it-IT-DiegoNeural",
688
- "Elsa": "it-IT-ElsaNeural"
689
- },
690
- "Dutch": {
691
- "Colette": "nl-NL-ColetteNeural",
692
- "Fenna": "nl-NL-FennaNeural",
693
- "Maarten": "nl-NL-MaartenNeural",
694
- "Arnaud": "nl-BE-ArnaudNeural",
695
- "Dena": "nl-BE-DenaNeural"
696
- },
697
- "Malay": {
698
- "Osman": "ms-MY-OsmanNeural",
699
- "Yasmin": "ms-MY-YasminNeural"
700
- },
701
- "Norwegian": {
702
- "Pernille": "nb-NO-PernilleNeural",
703
- "Finn": "nb-NO-FinnNeural"
704
- },
705
- "Swedish": {
706
- "Sofie": "sv-SE-SofieNeural",
707
- "Mattias": "sv-SE-MattiasNeural"
708
- },
709
- "Greek": {
710
- "Athina": "el-GR-AthinaNeural",
711
- "Nestoras": "el-GR-NestorasNeural"
712
- },
713
- "German": {
714
- "Katja": "de-DE-KatjaNeural",
715
- "Amala": "de-DE-AmalaNeural",
716
- "Conrad": "de-DE-ConradNeural",
717
- "Killian": "de-DE-KillianNeural",
718
- "Ingrid": "de-AT-IngridNeural",
719
- "Jonas": "de-AT-JonasNeural",
720
- "Jan": "de-CH-JanNeural",
721
- "Leni": "de-CH-LeniNeural"
722
- },
723
- "Afrikaans": {
724
- "Adri": "af-ZA-AdriNeural",
725
- "Willem": "af-ZA-WillemNeural"
726
- },
727
- "Amharic": {
728
- "Ameha": "am-ET-AmehaNeural",
729
- "Mekdes": "am-ET-MekdesNeural"
730
- },
731
- "Azerbaijani": {
732
- "Babek": "az-AZ-BabekNeural",
733
- "Banu": "az-AZ-BanuNeural"
734
- },
735
- "Bulgarian": {
736
- "Borislav": "bg-BG-BorislavNeural",
737
- "Kalina": "bg-BG-KalinaNeural"
738
- },
739
- "Bengali": {
740
- "Nabanita": "bn-BD-NabanitaNeural",
741
- "Pradeep": "bn-BD-PradeepNeural",
742
- "Bashkar": "bn-IN-BashkarNeural",
743
- "Tanishaa": "bn-IN-TanishaaNeural"
744
- },
745
- "Bosnian": {
746
- "Goran": "bs-BA-GoranNeural",
747
- "Vesna": "bs-BA-VesnaNeural"
748
- },
749
- "Catalan": {
750
- "Joana": "ca-ES-JoanaNeural",
751
- "Enric": "ca-ES-EnricNeural"
752
- },
753
- "Czech": {
754
- "Antonin": "cs-CZ-AntoninNeural",
755
- "Vlasta": "cs-CZ-VlastaNeural"
756
- },
757
- "Welsh": {
758
- "Aled": "cy-GB-AledNeural",
759
- "Nia": "cy-GB-NiaNeural"
760
- },
761
- "Danish": {
762
- "Christel": "da-DK-ChristelNeural",
763
- "Jeppe": "da-DK-JeppeNeural"
764
- },
765
- "Estonian": {
766
- "Anu": "et-EE-AnuNeural",
767
- "Kert": "et-EE-KertNeural"
768
- },
769
- "Persian": {
770
- "Dilara": "fa-IR-DilaraNeural",
771
- "Farid": "fa-IR-FaridNeural"
772
- },
773
- "Finnish": {
774
- "Harri": "fi-FI-HarriNeural",
775
- "Noora": "fi-FI-NooraNeural"
776
- },
777
- "Irish": {
778
- "Colm": "ga-IE-ColmNeural",
779
- "Orla": "ga-IE-OrlaNeural"
780
- },
781
- "Galician": {
782
- "Roi": "gl-ES-RoiNeural",
783
- "Sabela": "gl-ES-SabelaNeural"
784
- },
785
- "Gujarati": {
786
- "Dhwani": "gu-IN-DhwaniNeural",
787
- "Niranjan": "gu-IN-NiranjanNeural"
788
- },
789
- "Croatian": {
790
- "Gabrijela": "hr-HR-GabrijelaNeural",
791
- "Srecko": "hr-HR-SreckoNeural"
792
- },
793
- "Hungarian": {
794
- "Noemi": "hu-HU-NoemiNeural",
795
- "Tamas": "hu-HU-TamasNeural"
796
- },
797
- "Icelandic": {
798
- "Gudrun": "is-IS-GudrunNeural",
799
- "Gunnar": "is-IS-GunnarNeural"
800
- },
801
- "Javanese": {
802
- "Dimas": "jv-ID-DimasNeural",
803
- "Siti": "jv-ID-SitiNeural"
804
- },
805
- "Georgian": {
806
- "Eka": "ka-GE-EkaNeural",
807
- "Giorgi": "ka-GE-GiorgiNeural"
808
- },
809
- "Kazakh": {
810
- "Aigul": "kk-KZ-AigulNeural",
811
- "Daulet": "kk-KZ-DauletNeural"
812
- },
813
- "Khmer": {
814
- "Piseth": "km-KH-PisethNeural",
815
- "Sreymom": "km-KH-SreymomNeural"
816
- },
817
- "Kannada": {
818
- "Gagan": "kn-IN-GaganNeural",
819
- "Sapna": "kn-IN-SapnaNeural"
820
- },
821
- "Lao": {
822
- "Chanthavong": "lo-LA-ChanthavongNeural",
823
- "Keomany": "lo-LA-KeomanyNeural"
824
- },
825
- "Lithuanian": {
826
- "Leonas": "lt-LT-LeonasNeural",
827
- "Ona": "lt-LT-OnaNeural"
828
- },
829
- "Latvian": {
830
- "Everita": "lv-LV-EveritaNeural",
831
- "Nils": "lv-LV-NilsNeural"
832
- },
833
- "Macedonian": {
834
- "Aleksandar": "mk-MK-AleksandarNeural",
835
- "Marija": "mk-MK-MarijaNeural"
836
- },
837
- "Malayalam": {
838
- "Midhun": "ml-IN-MidhunNeural",
839
- "Sobhana": "ml-IN-SobhanaNeural"
840
- },
841
- "Mongolian": {
842
- "Bataa": "mn-MN-BataaNeural",
843
- "Yesui": "mn-MN-YesuiNeural"
844
- },
845
- "Marathi": {
846
- "Aarohi": "mr-IN-AarohiNeural",
847
- "Manohar": "mr-IN-ManoharNeural"
848
- },
849
- "Maltese": {
850
- "Grace": "mt-MT-GraceNeural",
851
- "Joseph": "mt-MT-JosephNeural"
852
- },
853
- "Burmese": {
854
- "Nilar": "my-MM-NilarNeural",
855
- "Thiha": "my-MM-ThihaNeural"
856
- },
857
- "Nepali": {
858
- "Hemkala": "ne-NP-HemkalaNeural",
859
- "Sagar": "ne-NP-SagarNeural"
860
- },
861
- "Polish": {
862
- "Marek": "pl-PL-MarekNeural",
863
- "Zofia": "pl-PL-ZofiaNeural"
864
- },
865
- "Pashto": {
866
- "Gul Nawaz": "ps-AF-GulNawazNeural",
867
- "Latifa": "ps-AF-LatifaNeural"
868
- },
869
- "Romanian": {
870
- "Alina": "ro-RO-AlinaNeural",
871
- "Emil": "ro-RO-EmilNeural"
872
- },
873
- "Russian": {
874
- "Svetlana": "ru-RU-SvetlanaNeural",
875
- "Dmitry": "ru-RU-DmitryNeural"
876
- },
877
- "Sinhala": {
878
- "Sameera": "si-LK-SameeraNeural",
879
- "Thilini": "si-LK-ThiliniNeural"
880
- },
881
- "Slovak": {
882
- "Lukas": "sk-SK-LukasNeural",
883
- "Viktoria": "sk-SK-ViktoriaNeural"
884
- },
885
- "Slovenian": {
886
- "Petra": "sl-SI-PetraNeural",
887
- "Rok": "sl-SI-RokNeural"
888
- },
889
- "Somali": {
890
- "Muuse": "so-SO-MuuseNeural",
891
- "Ubax": "so-SO-UbaxNeural"
892
- },
893
- "Albanian": {
894
- "Anila": "sq-AL-AnilaNeural",
895
- "Ilir": "sq-AL-IlirNeural"
896
- },
897
- "Serbian": {
898
- "Nicholas": "sr-RS-NicholasNeural",
899
- "Sophie": "sr-RS-SophieNeural"
900
- },
901
- "Sundanese": {
902
- "Jajang": "su-ID-JajangNeural",
903
- "Tuti": "su-ID-TutiNeural"
904
- },
905
- "Swahili": {
906
- "Rafiki": "sw-KE-RafikiNeural",
907
- "Zuri": "sw-KE-ZuriNeural",
908
- "Daudi": "sw-TZ-DaudiNeural",
909
- "Rehema": "sw-TZ-RehemaNeural"
910
  },
911
- "Tamil": {
912
- "Pallavi": "ta-IN-PallaviNeural",
913
- "Valluvar": "ta-IN-ValluvarNeural",
914
- "Kumar": "ta-LK-KumarNeural",
915
- "Saranya": "ta-LK-SaranyaNeural",
916
- "Kani": "ta-MY-KaniNeural",
917
- "Surya": "ta-MY-SuryaNeural",
918
- "Anbu": "ta-SG-AnbuNeural"
919
- },
920
- "Telugu": {
921
- "Mohan": "te-IN-MohanNeural",
922
- "Shruti": "te-IN-ShrutiNeural"
923
- },
924
- "Turkish": {
925
- "Ahmet": "tr-TR-AhmetNeural",
926
- "Emel": "tr-TR-EmelNeural"
927
- },
928
- "Ukrainian": {
929
- "Ostap": "uk-UA-OstapNeural",
930
- "Polina": "uk-UA-PolinaNeural"
931
- },
932
- "Urdu": {
933
- "Gul": "ur-IN-GulNeural",
934
- "Salman": "ur-IN-SalmanNeural",
935
- "Asad": "ur-PK-AsadNeural",
936
- "Uzma": "ur-PK-UzmaNeural"
937
- },
938
- "Uzbek": {
939
- "Madina": "uz-UZ-MadinaNeural",
940
- "Sardor": "uz-UZ-SardorNeural"
941
- },
942
- "Mandarin": {
943
- "Xiaoxiao": "zh-CN-XiaoxiaoNeural",
944
- "Yunyang": "zh-CN-YunyangNeural",
945
- "Yunxi": "zh-CN-YunxiNeural",
946
- "Xiaoyi": "zh-CN-XiaoyiNeural",
947
- "Yunjian": "zh-CN-YunjianNeural",
948
- "Yunxia": "zh-CN-YunxiaNeural",
949
- "Xiaobei": "zh-CN-liaoning-XiaobeiNeural",
950
- "Xiaoni": "zh-CN-shaanxi-XiaoniNeural",
951
- "HiuMaan": "zh-HK-HiuMaanNeural",
952
- "HiuGaai": "zh-HK-HiuGaaiNeural",
953
- "WanLung": "zh-HK-WanLungNeural",
954
- "HsiaoChen": "zh-TW-HsiaoChenNeural",
955
- "HsiaoYu": "zh-TW-HsiaoYuNeural",
956
- "YunJhe": "zh-TW-YunJheNeural"
957
- },
958
- "Zulu": {
959
- "Thando": "zu-ZA-ThandoNeural",
960
- "Themba": "zu-ZA-ThembaNeural"
961
- }
962
  }
963
 
964
- # Ensure these have initial values, even if temporary
965
- default_language = "English"
966
- default_speaker = language_dict[default_language][list(language_dict[default_language].keys())[0]] # Set to first English speaker
967
 
968
- def get_speakers(language):
 
 
 
 
 
 
 
 
 
969
  speakers = list(language_dict[language].keys())
970
  # Return gr.update to set choices and selected value
971
- return gr.update(choices=speakers, value=speakers[0], interactive=True), gr.Checkbox(visible=language == "Arabic", interactive=True)
 
972
 
973
  atexit.register(file_manager.cleanup_all)
974
 
975
- # Create Gradio interface
976
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
977
  css="""
978
  :root {
@@ -1104,7 +668,7 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
1104
  display: none !important;
1105
  }
1106
  """
1107
- ) as app: # Changed demo to app for consistency
1108
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
1109
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
1110
 
@@ -1113,17 +677,19 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
1113
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
1114
 
1115
  with gr.Column(scale=2):
1116
- language = gr.Dropdown( # Changed to language for consistency
 
1117
  label="Select Language",
1118
  choices=list(language_dict.keys()),
1119
  value=default_language,
1120
  interactive=True
1121
  )
1122
- speaker = gr.Dropdown( # Changed to speaker for consistency
 
1123
  label="Select Voice",
1124
- choices=list(language_dict[default_language].keys()), # Initialize with default language's speakers
1125
- value=list(language_dict[default_language].keys())[0], # Default to first speaker of default language
1126
- interactive=True # Should be interactive if it changes based on language
1127
  )
1128
  pitch_slider = gr.Slider(
1129
  label="Pitch Adjustment (Hz)",
@@ -1165,10 +731,11 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
1165
  value=True,
1166
  info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
1167
  )
1168
- tashkeel_checkbox = gr.Checkbox( # Moved here for better layout
 
1169
  label="Tashkeel (Arabic Only)",
1170
  value=False,
1171
- visible=False, # Initially hidden
1172
  interactive=True
1173
  )
1174
 
@@ -1178,17 +745,17 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
1178
 
1179
  with gr.Row():
1180
  with gr.Column():
1181
- audio_output = gr.Audio(label="Preview Audio")
1182
  with gr.Column():
1183
- # Use gr.HTML for download links
1184
- srt_download_link = gr.HTML(value="", visible=False, label="Download SRT")
1185
- audio_download_link = gr.HTML(value="", visible=False, label="Download Audio")
1186
-
1187
  # Event Handlers
1188
- language.change(
1189
- fn=get_speakers,
1190
- inputs=[language],
1191
- outputs=[speaker, tashkeel_checkbox] # Ensure correct output for dropdown and checkbox
1192
  )
1193
 
1194
  submit_btn.click(
@@ -1197,15 +764,15 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
1197
  text_input,
1198
  pitch_slider,
1199
  rate_slider,
1200
- speaker, # Use 'speaker' here as it holds the actual voice code
1201
  words_per_line,
1202
  lines_per_segment,
1203
  parallel_processing
1204
  ],
1205
  outputs=[
1206
- audio_output,
1207
- srt_download_link,
1208
- audio_download_link,
1209
  error_output
1210
  ],
1211
  api_name="generate"
 
11
  from typing import List, Tuple, Optional, Dict, Any
12
  import math
13
  from dataclasses import dataclass
14
+ from pathlib import Path # Import Path for cleaner file handling
15
 
16
  class TimingManager:
17
  def __init__(self):
 
42
  end_time: int = 0
43
  duration: int = 0
44
  audio: Optional[AudioSegment] = None
45
+ lines: List[str] = None
46
 
47
  class TextProcessor:
48
  def __init__(self, words_per_line: int, lines_per_segment: int):
49
  self.words_per_line = words_per_line
50
  self.lines_per_segment = lines_per_segment
51
  self.min_segment_words = 3
52
+ self.max_segment_words = words_per_line * lines_per_segment * 1.5
53
  self.punctuation_weights = {
54
+ '.': 1.0,
55
  '!': 1.0,
56
  '?': 1.0,
57
+ ';': 0.8,
58
  ':': 0.7,
59
+ ',': 0.5,
60
+ '-': 0.3,
61
  '(': 0.2,
62
  ')': 0.2
63
  }
64
 
65
  def analyze_sentence_complexity(self, text: str) -> float:
 
66
  words = text.split()
67
  complexity = 1.0
 
 
68
  if len(words) > self.words_per_line * 2:
69
  complexity *= 1.2
 
 
70
  punct_count = sum(text.count(p) for p in self.punctuation_weights.keys())
71
  complexity *= (1 + (punct_count / len(words)) * 0.5)
 
72
  return complexity
73
 
74
  def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
 
75
  breaks = []
76
  words = text.split()
 
77
  for i, word in enumerate(words):
78
  weight = 0
 
 
79
  for punct, punct_weight in self.punctuation_weights.items():
80
  if word.endswith(punct):
81
  weight = max(weight, punct_weight)
 
 
82
  phrase_starters = {'however', 'therefore', 'moreover', 'furthermore', 'meanwhile', 'although', 'because'}
83
  if i < len(words) - 1 and words[i+1].lower() in phrase_starters:
84
  weight = max(weight, 0.6)
 
 
85
  if i > self.min_segment_words:
86
  conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}
87
  if word.lower() in conjunctions:
88
  weight = max(weight, 0.4)
 
89
  if weight > 0:
90
  breaks.append((i, weight))
 
91
  return breaks
92
 
93
  def split_into_segments(self, text: str) -> List[Segment]:
 
94
  text = re.sub(r'\s+', ' ', text.strip())
95
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
96
  text = re.sub(r'\s+([.!?,;:])', r'\1', text)
 
 
97
  segments = []
 
 
98
  words = text.split()
99
 
100
  i = 0
 
102
  complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
103
  breaks = self.find_natural_breaks(' '.join(words[i:i + int(self.max_segment_words * complexity)]))
104
 
 
105
  best_break = None
106
  best_weight = 0
107
 
 
114
  best_weight = weight
115
 
116
  if best_break is None:
 
117
  best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)
118
 
 
119
  segment_words = words[i:i + best_break + 1]
120
  segment_text = ' '.join(segment_words)
 
 
121
  lines = self.split_into_lines(segment_text)
122
  final_segment_text = '\n'.join(lines)
123
 
 
127
  ))
128
 
129
  i += best_break + 1
 
130
  return segments
131
 
132
  def split_into_lines(self, text: str) -> List[str]:
 
133
  words = text.split()
134
  lines = []
135
  current_line = []
 
139
  current_line.append(word)
140
  word_count += 1
141
 
 
142
  is_break = (
143
  word_count >= self.words_per_line or
144
  any(word.endswith(p) for p in '.!?') or
 
153
 
154
  if current_line:
155
  lines.append(' '.join(current_line))
 
156
  return lines
157
 
158
  class TTSError(Exception):
 
160
  pass
161
 
162
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
 
163
  audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
164
  try:
 
165
  segment_text = ' '.join(segment.text.split('\n'))
166
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
167
 
 
175
 
176
  try:
177
  segment.audio = AudioSegment.from_file(audio_file)
 
178
  silence = AudioSegment.silent(duration=30)
179
  segment.audio = silence + segment.audio + silence
180
  segment.duration = len(segment.audio)
 
191
  try:
192
  os.remove(audio_file)
193
  except Exception:
194
+ pass
195
 
196
  class FileManager:
197
  """Manages temporary and output files with cleanup capabilities"""
198
  def __init__(self):
199
  self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
200
  self.output_files = []
201
+ self.max_files_to_keep = 5
202
 
203
  def get_temp_path(self, prefix):
 
204
  return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")
205
 
206
  def create_output_paths(self):
 
207
  unique_id = str(uuid.uuid4())
208
  audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
209
  srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")
 
214
  return srt_path, audio_path
215
 
216
  def cleanup_old_files(self):
 
217
  if len(self.output_files) > self.max_files_to_keep:
218
  old_files = self.output_files[:-self.max_files_to_keep]
219
  for srt_path, audio_path in old_files:
 
223
  if os.path.exists(audio_path):
224
  os.remove(audio_path)
225
  except Exception:
226
+ pass
 
 
227
  self.output_files = self.output_files[-self.max_files_to_keep:]
228
 
229
  def cleanup_all(self):
 
230
  for srt_path, audio_path in self.output_files:
231
  try:
232
  if os.path.exists(srt_path):
 
234
  if os.path.exists(audio_path):
235
  os.remove(audio_path)
236
  except Exception:
237
+ pass
 
238
  try:
239
  os.rmdir(self.temp_dir)
240
  except Exception:
241
+ pass
242
 
243
  file_manager = FileManager()
244
 
 
253
  parallel: bool = True,
254
  max_workers: int = 4
255
  ) -> Tuple[str, str]:
 
256
  processor = TextProcessor(words_per_line, lines_per_segment)
257
  segments = processor.split_into_segments(text)
258
 
 
356
 
357
  return srt_path, audio_path
358
 
359
+
360
  async def process_text_with_progress(
361
  text,
362
  pitch,
363
  rate,
364
+ voice, # This is the actual voice string from the dropdown
365
  words_per_line,
366
  lines_per_segment,
367
  parallel_processing,
368
  progress=gr.Progress()
369
  ):
370
+ # Initialize all outputs to their 'cleared' or 'hidden' state
371
+ # This is crucial for consistency and to avoid the TypeError.
372
+ audio_output_path = None
373
+ srt_link_html = ""
374
+ audio_link_html = ""
375
+ status_message = ""
376
+
377
  # Input validation
378
  if not text or text.strip() == "":
379
+ status_message = "Please enter some text to convert to speech."
380
  return (
381
+ audio_output_path,
382
+ gr.update(value=srt_link_html, visible=False),
383
+ gr.update(value=audio_link_html, visible=False),
384
+ gr.update(value=status_message, visible=True)
385
  )
386
 
387
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
 
393
  def update_progress(value, status):
394
  progress(value, status)
395
 
396
+ # Pass the actual voice string (e.g., "en-US-JennyNeural")
397
  srt_path, audio_path = await generate_accurate_srt(
398
  text,
399
+ voice, # Use 'voice' directly here
400
  rate_str,
401
  pitch_str,
402
  words_per_line,
 
405
  parallel=parallel_processing
406
  )
407
 
408
+ # Construct download links using Gradio's file serving prefix and target="_blank"
409
+ # The 'file=' prefix is what tells Gradio to serve the local temp file.
410
+ srt_link_html = f"""
411
  <a href="file={srt_path}" download="subtitles.srt" target="_blank"
412
  style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
413
  onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
 
415
  Download SRT File
416
  </a>
417
  """
418
+ audio_link_html = f"""
419
  <a href="file={audio_path}" download="audio.mp3" target="_blank"
420
  style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
421
  onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
 
423
  Download Audio File
424
  </a>
425
  """
426
+
427
+ audio_output_path = audio_path # Path for the gr.Audio preview
428
+ status_message = "Complete!"
429
 
430
+ # Return the updates. All outputs must be present in the tuple.
431
  return (
432
+ audio_output_path, # gr.Audio expects a path or None
433
+ gr.update(value=srt_link_html, visible=True), # gr.HTML expects a string, set visible True
434
+ gr.update(value=audio_link_html, visible=True), # gr.HTML expects a string, set visible True
435
+ gr.update(value=status_message, visible=True) # Update status message
436
  )
437
  except TTSError as e:
438
+ status_message = f"TTS Error: {str(e)}"
439
  except Exception as e:
440
+ status_message = f"Unexpected error: {str(e)}"
441
 
442
+ # Unified error return. Ensure all outputs are handled.
443
  return (
444
+ None, # Clear audio output
445
+ gr.update(value="", visible=False), # Hide SRT link
446
+ gr.update(value="", visible=False), # Hide Audio link
447
+ gr.update(value=status_message, visible=True) # Show error message
448
  )
449
 
450
+ # --- Voice options and Gradio interface ---
451
+ voice_options = {
452
+ # Consolidated all voices under a single dictionary for direct lookup by `speaker` name
453
+ "Andrew Male": "en-US-AndrewNeural",
454
+ "Jenny Female": "en-US-JennyNeural",
455
+ "Guy Male": "en-US-GuyNeural",
456
+ "Ana Female": "en-US-AnaNeural",
457
+ "Aria Female": "en-US-AriaNeural",
458
+ "Brian Male": "en-US-BrianNeural",
459
+ "Christopher Male": "en-US-ChristopherNeural",
460
+ "Eric Male": "en-US-EricNeural",
461
+ "Michelle Male": "en-US-MichelleNeural",
462
+ "Roger Male": "en-US-RogerNeural",
463
+ "Natasha Female": "en-AU-NatashaNeural",
464
+ "William Male": "en-AU-WilliamNeural",
465
+ "Clara Female": "en-CA-ClaraNeural",
466
+ "Liam Female ": "en-CA-LiamNeural",
467
+ "Libby Female": "en-GB-LibbyNeural",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  "Maisie": "en-GB-MaisieNeural",
469
  "Ryan": "en-GB-RyanNeural",
470
  "Sonia": "en-GB-SoniaNeural",
 
487
  "Elimu": "en-TZ-ElimuNeural",
488
  "Imani": "en-TZ-ImaniNeural",
489
  "Leah": "en-ZA-LeahNeural",
490
+ "Luke": "en-ZA-LukeNeural",
491
+ "Madhur": "hi-IN-MadhurNeural", # Added Hindi voices
492
+ "Swara": "hi-IN-SwaraNeural",
493
+ "Elena": "es-AR-ElenaNeural", # Spanish
494
  "Tomas": "es-AR-TomasNeural",
495
+ # ... (all other voices from your original language_dict need to be flattened here)
496
+ # FOR BREVITY, I AM NOT COPYING ALL VOICE OPTIONS HERE.
497
+ # YOU MUST FLATTEN YOUR `language_dict` INTO THIS `voice_options` DICTIONARY.
498
+ # EXAMPLE:
499
+ # "Hamed": "ar-SA-HamedNeural",
500
+ # "Sun-Hi": "ko-KR-SunHiNeural",
501
+ # "Premwadee": "th-TH-PremwadeeNeural",
502
+ # etc. for all languages
503
+ }
504
+
505
+ # Re-create language_dict for dropdown population if needed, but the core TTS will use voice_options directly
506
# Voices grouped by language: {language: {display name: voice id}}.
# The language dropdown lists the outer keys; the voice dropdown lists the
# display names of the selected language.
# TODO: only a subset of voices is listed; extend each language as needed.
language_dict = {
    "Hindi": {"Madhur": "hi-IN-MadhurNeural", "Swara": "hi-IN-SwaraNeural"},
    "English": {
        "Jenny Female": "en-US-JennyNeural",
        "Guy Male": "en-US-GuyNeural",
    },
    "Spanish": {
        "Elena": "es-AR-ElenaNeural",
        "Tomas": "es-AR-TomasNeural",
    },
}

# Flat {display name: voice id} lookup rebuilt from language_dict; this
# replaces any earlier contents of voice_options.
voice_options = {}
for speakers in language_dict.values():
    voice_options.update(speakers)

default_language = "English"
# First display name listed for the default language (e.g. "Jenny Female").
default_speaker_name = list(language_dict[default_language])[0]
531
+
532
def get_speakers_for_language(language):
    """Build the dropdown/checkbox updates for a newly selected language.

    Args:
        language: Key into ``language_dict``. May be ``None`` or unknown,
            for example when the language dropdown is cleared.

    Returns:
        A pair of ``gr.update`` objects: the first sets the voice dropdown's
        choices to the language's display names and selects the first one
        (no choices and no selection if the language has no voices); the
        second shows the Tashkeel checkbox only when the language is
        "Arabic".
    """
    # An unknown or cleared language yields an empty list instead of raising
    # KeyError inside the event handler.
    speakers = list(language_dict.get(language, {}))
    # Guard the empty case so speakers[0] cannot raise IndexError.
    selected = speakers[0] if speakers else None
    return (
        gr.update(choices=speakers, value=selected, interactive=True),
        gr.update(visible=language == "Arabic", interactive=True),
    )
536
+
537
 
538
  atexit.register(file_manager.cleanup_all)
539
 
 
540
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
541
  css="""
542
  :root {
 
668
  display: none !important;
669
  }
670
  """
671
+ ) as app:
672
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
673
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
674
 
 
677
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
678
 
679
  with gr.Column(scale=2):
680
+ # Using your `language_dict` for dropdown population
681
+ language_dropdown = gr.Dropdown(
682
  label="Select Language",
683
  choices=list(language_dict.keys()),
684
  value=default_language,
685
  interactive=True
686
  )
687
+ # The speaker dropdown will be updated by the language_dropdown.change event
688
+ speaker_dropdown = gr.Dropdown(
689
  label="Select Voice",
690
+ choices=list(language_dict[default_language].keys()),
691
+ value=default_speaker_name,
692
+ interactive=True
693
  )
694
  pitch_slider = gr.Slider(
695
  label="Pitch Adjustment (Hz)",
 
731
  value=True,
732
  info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
733
  )
734
+ # Tashkeel checkbox for Arabic
735
+ tashkeel_checkbox = gr.Checkbox(
736
  label="Tashkeel (Arabic Only)",
737
  value=False,
738
+ visible=False,
739
  interactive=True
740
  )
741
 
 
745
 
746
  with gr.Row():
747
  with gr.Column():
748
+ audio_preview = gr.Audio(label="Preview Audio") # Renamed for clarity
749
  with gr.Column():
750
+ # Use gr.HTML for download links, initially hidden
751
+ srt_download_html_output = gr.HTML(value="", visible=False)
752
+ audio_download_html_output = gr.HTML(value="", visible=False)
753
+
754
  # Event Handlers
755
+ language_dropdown.change(
756
+ fn=get_speakers_for_language, # Renamed function for clarity
757
+ inputs=[language_dropdown],
758
+ outputs=[speaker_dropdown, tashkeel_checkbox]
759
  )
760
 
761
  submit_btn.click(
 
764
  text_input,
765
  pitch_slider,
766
  rate_slider,
767
+ speaker_dropdown, # This now correctly passes the selected speaker name (e.g., "Jenny Female")
768
  words_per_line,
769
  lines_per_segment,
770
  parallel_processing
771
  ],
772
  outputs=[
773
+ audio_preview,
774
+ srt_download_html_output,
775
+ audio_download_html_output,
776
  error_output
777
  ],
778
  api_name="generate"