hivecorp committed on
Commit
9a83649
·
verified ·
1 Parent(s): bcbb7e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +411 -179
app.py CHANGED
@@ -12,9 +12,6 @@ from typing import List, Tuple, Optional, Dict, Any
12
  import math
13
  from dataclasses import dataclass
14
 
15
- # No changes to these classes and helper functions
16
- # (TimingManager, Segment, TextProcessor, TTSError, etc.)
17
- # ...
18
  class TimingManager:
19
  def __init__(self):
20
  self.current_time = 0
@@ -44,115 +41,179 @@ class Segment:
44
  end_time: int = 0
45
  duration: int = 0
46
  audio: Optional[AudioSegment] = None
47
- lines: List[str] = None
48
 
class TextProcessor:
    """Turn free text into subtitle segments made of short display lines.

    A segment is the unit that is later synthesised as one TTS request; its
    text holds the display lines joined by newlines.
    """

    def __init__(self, words_per_line: int, lines_per_segment: int):
        """Store the layout limits and the per-punctuation break weights."""
        self.words_per_line = words_per_line
        self.lines_per_segment = lines_per_segment
        self.min_segment_words = 3
        self.max_segment_words = words_per_line * lines_per_segment * 1.5
        self.punctuation_weights = {
            '.': 1.0,
            '!': 1.0,
            '?': 1.0,
            ';': 0.8,
            ':': 0.7,
            ',': 0.5,
            '-': 0.3,
            '(': 0.2,
            ')': 0.2,
        }

    def analyze_sentence_complexity(self, text: str) -> float:
        """Return a multiplier >= 1.0 that grows with length and punctuation density.

        Text without any words yields the neutral value 1.0.
        """
        tokens = text.split()
        if not tokens:
            return 1.0
        length_factor = 1.2 if len(tokens) > self.words_per_line * 2 else 1.0
        marks = sum(text.count(mark) for mark in self.punctuation_weights)
        return length_factor * (1 + (marks / len(tokens)) * 0.5)

    def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
        """Return ``(word_index, weight)`` for every word after which a break is plausible."""
        starters = {'however', 'therefore', 'moreover', 'furthermore', 'meanwhile', 'although', 'because'}
        joiners = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}
        tokens = text.split()
        last = len(tokens) - 1
        found = []
        for pos, token in enumerate(tokens):
            # Trailing punctuation on the word itself.
            score = max(
                (w for mark, w in self.punctuation_weights.items() if token.endswith(mark)),
                default=0,
            )
            # The next word opens a new clause.
            if pos < last and tokens[pos + 1].lower() in starters:
                score = max(score, 0.6)
            # Conjunctions only count once a few words have passed.
            if pos > self.min_segment_words and token.lower() in joiners:
                score = max(score, 0.4)
            if score > 0:
                found.append((pos, score))
        return found

    def split_into_segments(self, text: str) -> "List[Segment]":
        """Cut *text* into consecutively numbered segments.

        Within a look-ahead window of ``int(max_segment_words)`` words, the
        break whose weight, scaled by its distance from the ideal length
        (``words_per_line * lines_per_segment``), is highest wins. Without any
        candidate the segment is cut at the ideal length or the end of text.
        """
        normalized = re.sub(r'\s+', ' ', text.strip())
        normalized = re.sub(r'([.!?,;:])\s*', r'\1 ', normalized)
        normalized = re.sub(r'\s+([.!?,;:])', r'\1', normalized)

        tokens = normalized.split()
        target = self.words_per_line * self.lines_per_segment
        window = int(self.max_segment_words)
        result = []
        start = 0
        while start < len(tokens):
            candidates = self.find_natural_breaks(' '.join(tokens[start:start + window]))
            chosen, chosen_score = -1, -1
            for offset, weight in candidates:
                score = weight * (1 - (abs(offset - target) / target) * 0.5)
                if score > chosen_score:
                    chosen, chosen_score = offset, score
            if chosen == -1:
                chosen = min(target, len(tokens) - 1 - start)
            stop = start + chosen + 1
            body = '\n'.join(self.split_into_lines(' '.join(tokens[start:stop])))
            result.append(Segment(id=len(result) + 1, text=body))
            start = stop
        return result

    def split_into_lines(self, text: str) -> List[str]:
        """Wrap one segment's text into display lines.

        A line ends when it is full, at sentence-ending punctuation, or at a
        softer mark once the line is at least 70% full; a break is only taken
        while the line holds fewer words than the whole segment.
        """
        tokens = text.split()
        total = len(tokens)
        soft_limit = self.words_per_line * 0.7
        finished, pending = [], []
        for token in tokens:
            pending.append(token)
            filled = len(pending)
            if total <= filled:
                continue
            if (filled >= self.words_per_line
                    or token.endswith(('.', '!', '?'))
                    or (filled >= soft_limit and token.endswith((',', ';', ':')))):
                finished.append(' '.join(pending))
                pending = []
        if pending:
            finished.append(' '.join(pending))
        return finished
139
 
 
140
  class TTSError(Exception):
 
141
  pass
142
 
143
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
144
- temp_dir = tempfile.gettempdir()
145
- audio_file = os.path.join(temp_dir, f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
146
  try:
 
147
  segment_text = ' '.join(segment.text.split('\n'))
148
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
149
- await tts.save(audio_file)
 
 
 
 
 
150
  if not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
151
  raise TTSError(f"Generated audio file is empty or missing for segment {segment.id}")
152
- segment.audio = AudioSegment.from_file(audio_file)
153
- silence = AudioSegment.silent(duration=30)
154
- segment.audio = silence + segment.audio + silence
155
- segment.duration = len(segment.audio)
 
 
 
 
 
 
156
  return segment
157
  except Exception as e:
158
  if not isinstance(e, TTSError):
@@ -163,210 +224,328 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
163
  try:
164
  os.remove(audio_file)
165
  except Exception:
166
- pass
167
 
 
class FileManager:
    """Own a private temp directory and the (srt, audio) output pairs written into it.

    Only the most recent ``max_files_to_keep`` pairs are kept on disk; older
    pairs are deleted whenever a new pair is allocated.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
        self.output_files = []
        self.max_files_to_keep = 5

    @staticmethod
    def _remove_quietly(path):
        """Delete *path*, ignoring a missing file or any other OS-level failure."""
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup: never let housekeeping break a request

    def create_output_paths(self):
        """Reserve and return a fresh ``(srt_path, audio_path)`` pair inside the temp dir."""
        unique_id = str(uuid.uuid4())
        audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
        srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")

        self.output_files.append((srt_path, audio_path))
        self.cleanup_old_files()

        return srt_path, audio_path

    def cleanup_old_files(self):
        """Delete every tracked pair except the newest ``max_files_to_keep``."""
        if len(self.output_files) <= self.max_files_to_keep:
            return
        keep_from = len(self.output_files) - self.max_files_to_keep
        for srt_path, audio_path in self.output_files[:keep_from]:
            # Each file is removed independently so one failure cannot
            # leave its sibling behind.
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)
        self.output_files = self.output_files[keep_from:]

    def cleanup_all(self):
        """Delete all tracked files, then the temp directory if it is empty."""
        for srt_path, audio_path in self.output_files:
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)
        self.output_files = []
        try:
            os.rmdir(self.temp_dir)
        except OSError:
            pass  # directory already gone or still holds untracked files
204
 
 
205
  file_manager = FileManager()
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
async def generate_accurate_srt(
    text: str, voice: str, rate: str, pitch: str,
    words_per_line: int, lines_per_segment: int,
    progress_callback=None, parallel: bool = True, max_workers: int = 4
) -> Tuple[str, str]:
    """Synthesise *text* segment by segment and write matching MP3 and SRT files.

    Args:
        text: Text to convert.
        voice, rate, pitch: Passed through to ``process_segment_with_timing``.
        words_per_line, lines_per_segment: Layout limits for ``TextProcessor``.
        progress_callback: Optional ``callback(fraction, status)``.
        parallel: Process segments concurrently when there is more than one.
        max_workers: Upper bound on concurrent segment tasks (minimum 1).

    Returns:
        ``(srt_path, audio_path)`` of the files that were written.

    Raises:
        TTSError: If any segment fails during parallel processing.
    """
    def report(value, status):
        if progress_callback:
            progress_callback(value, status)

    processor = TextProcessor(words_per_line, lines_per_segment)
    segments = processor.split_into_segments(text)
    total_segments = len(segments)
    report(0.1, "Text segmentation complete")

    processed_segments = []
    if parallel and total_segments > 1:
        # A semaphore created with 0 permits would block every task forever.
        semaphore = asyncio.Semaphore(max(1, max_workers))
        processed_count = 0

        async def process_with_semaphore(segment):
            nonlocal processed_count
            async with semaphore:
                result = await process_segment_with_timing(segment, voice, rate, pitch)
                processed_count += 1
                report(
                    0.1 + (0.8 * processed_count / total_segments),
                    f"Processed {processed_count}/{total_segments} segments",
                )
                return result

        tasks = [process_with_semaphore(s) for s in segments]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for res in results:
            # BaseException also covers asyncio.CancelledError, which is not
            # an Exception subclass on current Python versions.
            if isinstance(res, BaseException):
                raise TTSError(f"A task failed during parallel processing: {res}") from res
            processed_segments.append(res)
    else:
        for i, segment in enumerate(segments):
            processed_segments.append(
                await process_segment_with_timing(segment, voice, rate, pitch)
            )
            report(
                0.1 + (0.8 * (i + 1) / total_segments),
                f"Processed {i + 1}/{total_segments} segments",
            )

    # Parallel completion order is arbitrary; ids restore the reading order.
    processed_segments.sort(key=lambda s: s.id)
    report(0.9, "Finalizing audio and subtitles")

    current_time = 0
    final_audio = AudioSegment.empty()
    srt_entries = []
    for segment in processed_segments:
        # Each cue starts exactly where the previous audio clip ended.
        segment.start_time = current_time
        segment.end_time = current_time + segment.duration
        srt_entries.append(
            f"{segment.id}\n{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n{segment.text}\n\n"
        )
        final_audio = final_audio.append(segment.audio, crossfade=0)
        current_time = segment.end_time
    srt_content = "".join(srt_entries)

    srt_path, audio_path = file_manager.create_output_paths()
    export_params = {'format': 'mp3', 'bitrate': '192k', 'parameters': ['-ar', '44100', '-ac', '2', '-qscale:a', '2']}
    final_audio.export(audio_path, **export_params)
    with open(srt_path, "w", encoding='utf-8') as f:
        f.write(srt_content)

    report(1.0, "Complete!")
    return srt_path, audio_path
262
 
263
- ### MODIFICATION START ###
264
- # This new function creates the HTML for the download buttons using the JavaScript strategy.
265
def create_download_links_html(srt_path: str, audio_path: str) -> str:
    """Return an HTML snippet with JS-powered download links for both output files.

    The links point at ``/file=<path>``. Paths are percent-encoded for the URL
    and HTML-escaped for the attributes so that unusual characters in a path
    cannot break out of the markup. Returns ``""`` when either path is missing.
    """
    if not srt_path or not audio_path:
        return ""

    from html import escape
    from urllib.parse import quote

    srt_filename = escape(os.path.basename(srt_path), quote=True)
    audio_filename = escape(os.path.basename(audio_path), quote=True)

    # Fetch the file as a blob and trigger the download from a temporary
    # anchor, so the page itself never navigates away.
    js_download_logic = """
    event.preventDefault();
    fetch(this.href).then(resp => resp.blob()).then(blob => {
        const url = window.URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.style.display = 'none';
        a.href = url;
        a.download = this.getAttribute('download');
        document.body.appendChild(a);
        a.click();
        window.URL.revokeObjectURL(url);
        document.body.removeChild(a);
    });
    """

    srt_url = escape("/file=" + quote(str(srt_path), safe="/:"), quote=True)
    audio_url = escape("/file=" + quote(str(audio_path), safe="/:"), quote=True)

    html = f"""
    <div style="text-align: center; padding: 10px 0;">
        <a href="{srt_url}" download="{srt_filename}" onclick="{js_download_logic}"
           style="display: inline-block; padding: 8px 15px; background-color: #0b5ed7; color: white; text-decoration: none; border-radius: 5px; font-weight: 600; margin-right: 15px; cursor: pointer;">
           📥 Download SRT
        </a>
        <a href="{audio_url}" download="{audio_filename}" onclick="{js_download_logic}"
           style="display: inline-block; padding: 8px 15px; background-color: #0b5ed7; color: white; text-decoration: none; border-radius: 5px; font-weight: 600; cursor: pointer;">
           📥 Download Audio
        </a>
    </div>
    """
    return html
307
-
308
- # This main processing function is now simplified.
async def process_text_with_progress(
    text, pitch, rate, voice, words_per_line,
    lines_per_segment, parallel_processing,
    progress=gr.Progress()
):
    """Run the conversion for the UI and return ``(audio_path, html)``.

    On success ``html`` holds the download links. On validation or processing
    failure ``audio_path`` is ``None`` and ``html`` holds a red error message.
    """
    from html import escape

    def error_html(message):
        return f"<p style='color:red; text-align:center;'>{message}</p>"

    if not text or text.strip() == "":
        return None, error_html("Please enter some text to convert.")

    try:
        # ':+d' rejects floats, so whole-number sliders that deliver 5.0 are
        # coerced first; a bad value now ends up in the error message instead
        # of escaping the handler.
        pitch_str = f"{int(round(pitch)):+d}Hz"
        rate_str = f"{int(round(rate)):+d}%"

        progress(0, "Preparing text...")

        def update_progress(value, status):
            progress(value, status)

        srt_path, audio_path = await generate_accurate_srt(
            text, voice_options[voice], rate_str, pitch_str,
            words_per_line, lines_per_segment,
            progress_callback=update_progress,
            parallel=parallel_processing
        )

        return audio_path, create_download_links_html(srt_path, audio_path)

    except Exception as e:
        # The exception text may echo user input; escape it before it is
        # rendered as HTML.
        error_message = f"An error occurred: {str(e)}"
        return None, error_html(escape(error_message))
348
-
349
- ### MODIFICATION END ###
350
 
 
# Display label -> edge-tts voice id. Labels are shown in the voice dropdown.
# "Michelle" and "Liam" previously carried the wrong gender, and the "Liam"
# label had a trailing space.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}
366
 
 
367
  import atexit
368
  atexit.register(file_manager.cleanup_all)
369
 
 
370
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
371
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
372
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
@@ -374,45 +553,98 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
374
  with gr.Row():
375
  with gr.Column(scale=3):
376
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
 
377
  with gr.Column(scale=2):
378
- voice_dropdown = gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female")
379
- pitch_slider = gr.Slider(label="Pitch Adjustment (Hz)", minimum=-10, maximum=10, value=0, step=1)
380
- rate_slider = gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
  with gr.Row():
383
  with gr.Column():
384
- words_per_line = gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1, info="Words per subtitle line.")
 
 
 
 
 
 
 
385
  with gr.Column():
386
- lines_per_segment = gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1, info="Lines per subtitle block.")
 
 
 
 
 
 
 
387
  with gr.Column():
388
- parallel_processing = gr.Checkbox(label="Enable Parallel Processing", value=True, info="Faster conversion for longer texts.")
 
 
 
 
389
 
390
- submit_btn = gr.Button("Generate Audio & Subtitles", variant="primary")
 
 
 
391
 
392
- ### MODIFICATION START ###
393
- # The output area is simplified.
394
  with gr.Row():
395
- with gr.Column(scale=2):
396
- # This component is for the audio player preview.
397
- audio_preview = gr.Audio(label="Preview Audio")
398
- with gr.Column(scale=1):
399
- # This single HTML component will hold EITHER the download links OR an error message.
400
- status_and_download_output = gr.HTML(label="Status & Downloads")
401
-
402
- # The .click() event is now simpler and more robust.
 
 
 
 
 
 
 
 
 
 
403
  submit_btn.click(
404
  fn=process_text_with_progress,
405
  inputs=[
406
- text_input, pitch_slider, rate_slider, voice_dropdown,
407
- words_per_line, lines_per_segment, parallel_processing
 
 
 
 
 
408
  ],
409
  outputs=[
410
- audio_preview,
411
- status_and_download_output
 
 
 
412
  ],
413
  api_name="generate"
414
  )
415
- ### MODIFICATION END ###
416
 
417
  if __name__ == "__main__":
418
- app.launch()
 
12
  import math
13
  from dataclasses import dataclass
14
 
 
 
 
15
  class TimingManager:
16
  def __init__(self):
17
  self.current_time = 0
 
41
  end_time: int = 0
42
  duration: int = 0
43
  audio: Optional[AudioSegment] = None
44
+ lines: List[str] = None # Add lines field for display purposes only
45
 
class TextProcessor:
    """Split free text into subtitle segments and wrap each one into display lines.

    A segment is the unit later synthesised as one TTS request; its ``text``
    holds the display lines joined by newlines.
    """

    # Words that typically open a new clause; a break just before them reads naturally.
    _PHRASE_STARTERS = frozenset(
        {'however', 'therefore', 'moreover', 'furthermore', 'meanwhile', 'although', 'because'}
    )
    # Coordinating conjunctions; only considered after a few words have passed.
    _CONJUNCTIONS = frozenset({'and', 'but', 'or', 'nor', 'for', 'yet', 'so'})

    def __init__(self, words_per_line: int, lines_per_segment: int):
        """Store the layout limits and the per-punctuation break weights."""
        self.words_per_line = words_per_line
        self.lines_per_segment = lines_per_segment
        self.min_segment_words = 3
        # Allow 50% more than the nominal size so a natural break can be reached.
        self.max_segment_words = words_per_line * lines_per_segment * 1.5
        self.punctuation_weights = {
            '.': 1.0,  # Strong break
            '!': 1.0,
            '?': 1.0,
            ';': 0.8,  # Medium-strong break
            ':': 0.7,
            ',': 0.5,  # Medium break
            '-': 0.3,  # Weak break
            '(': 0.2,
            ')': 0.2,
        }

    def analyze_sentence_complexity(self, text: str) -> float:
        """Return a multiplier >= 1.0 that grows with length and punctuation density.

        Text without any words returns the neutral value 1.0 instead of
        dividing by zero.
        """
        words = text.split()
        if not words:
            return 1.0

        complexity = 1.0

        # Adjust for sentence length
        if len(words) > self.words_per_line * 2:
            complexity *= 1.2

        # Adjust for punctuation density
        punct_count = sum(text.count(p) for p in self.punctuation_weights)
        complexity *= 1 + (punct_count / len(words)) * 0.5

        return complexity

    def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
        """Return ``(word_index, weight)`` for every word after which a break is plausible."""
        breaks = []
        words = text.split()
        last_index = len(words) - 1

        for i, word in enumerate(words):
            weight = 0

            # Trailing punctuation on the word itself
            for punct, punct_weight in self.punctuation_weights.items():
                if word.endswith(punct):
                    weight = max(weight, punct_weight)

            # The following word opens a new clause
            if i < last_index and words[i + 1].lower() in self._PHRASE_STARTERS:
                weight = max(weight, 0.6)

            # Conjunctions at natural points
            if i > self.min_segment_words and word.lower() in self._CONJUNCTIONS:
                weight = max(weight, 0.4)

            if weight > 0:
                breaks.append((i, weight))

        return breaks

    def split_into_segments(self, text: str) -> "List[Segment]":
        """Cut *text* into consecutively numbered segments.

        For each position a look-ahead window, scaled by the local complexity,
        is scanned for break candidates. The heaviest candidate lying between
        ``min_segment_words`` and ``max_segment_words`` wins; without one the
        segment falls back to ``words_per_line * lines_per_segment``, capped by
        the words that remain.
        """
        # Normalize whitespace and spacing around punctuation
        text = re.sub(r'\s+', ' ', text.strip())
        text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
        text = re.sub(r'\s+([.!?,;:])', r'\1', text)

        segments = []
        words = text.split()

        i = 0
        while i < len(words):
            complexity = self.analyze_sentence_complexity(
                ' '.join(words[i:i + self.words_per_line * 2])
            )
            breaks = self.find_natural_breaks(
                ' '.join(words[i:i + int(self.max_segment_words * complexity)])
            )

            # Find best break point (offsets are relative to i)
            best_break = None
            best_weight = 0
            for break_idx, weight in breaks:
                in_range = self.min_segment_words <= break_idx <= self.max_segment_words
                if in_range and weight > best_weight:
                    best_break = break_idx
                    best_weight = weight

            if best_break is None:
                # If no good break found, use maximum length
                best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)

            segment_text = ' '.join(words[i:i + best_break + 1])
            final_segment_text = '\n'.join(self.split_into_lines(segment_text))

            segments.append(Segment(
                id=len(segments) + 1,
                text=final_segment_text
            ))

            i += best_break + 1

        return segments

    def split_into_lines(self, text: str) -> List[str]:
        """Wrap one segment's text into display lines.

        A line ends when it is full, at sentence-ending punctuation, or at a
        softer mark once the line is at least 70% full.
        """
        lines = []
        current_line = []
        soft_limit = self.words_per_line * 0.7

        for word in text.split():
            current_line.append(word)
            word_count = len(current_line)

            is_break = (
                word_count >= self.words_per_line
                or word.endswith(('.', '!', '?'))
                or (word_count >= soft_limit and word.endswith((',', ';', ':')))
            )

            if is_break:
                lines.append(' '.join(current_line))
                current_line = []

        if current_line:
            lines.append(' '.join(current_line))

        return lines
186
 
187
+ # IMPROVEMENT 1: Enhanced Error Handling
188
  class TTSError(Exception):
189
+ """Custom exception for TTS processing errors"""
190
  pass
191
 
192
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
193
+ """Process a complete segment as a single TTS unit with improved error handling"""
194
+ audio_file = os.path.join(tempfile.gettempdir(), f"temp_segment_{segment.id}_{uuid.uuid4()}.wav")
195
  try:
196
+ # Process the entire segment text as one unit, replacing newlines with spaces
197
  segment_text = ' '.join(segment.text.split('\n'))
198
  tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
199
+
200
+ try:
201
+ await tts.save(audio_file)
202
+ except Exception as e:
203
+ raise TTSError(f"Failed to generate audio for segment {segment.id}: {str(e)}")
204
+
205
  if not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
206
  raise TTSError(f"Generated audio file is empty or missing for segment {segment.id}")
207
+
208
+ try:
209
+ segment.audio = AudioSegment.from_file(audio_file)
210
+ # Reduced silence to 30ms for more natural flow
211
+ silence = AudioSegment.silent(duration=30)
212
+ segment.audio = silence + segment.audio + silence
213
+ segment.duration = len(segment.audio)
214
+ except Exception as e:
215
+ raise TTSError(f"Failed to process audio file for segment {segment.id}: {str(e)}")
216
+
217
  return segment
218
  except Exception as e:
219
  if not isinstance(e, TTSError):
 
224
  try:
225
  os.remove(audio_file)
226
  except Exception:
227
+ pass # Ignore deletion errors
228
 
229
+ # IMPROVEMENT 2: Better File Management with cleanup
class FileManager:
    """Manage a private temp directory and the (srt, audio) output pairs inside it.

    Only the most recent ``max_files_to_keep`` pairs stay on disk; older pairs
    are deleted whenever a new pair is allocated.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="tts_app_")
        self.output_files = []
        self.max_files_to_keep = 5  # Keep only the 5 most recent output pairs

    @staticmethod
    def _remove_quietly(path):
        """Delete *path*, ignoring a missing file or any other OS-level failure."""
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup: never let housekeeping break a request

    def get_temp_path(self, prefix):
        """Return a unique, not-yet-created path inside the temp directory."""
        return os.path.join(self.temp_dir, f"{prefix}_{uuid.uuid4()}")

    def create_output_paths(self):
        """Reserve and return a fresh ``(srt_path, audio_path)`` pair."""
        unique_id = str(uuid.uuid4())
        audio_path = os.path.join(self.temp_dir, f"final_audio_{unique_id}.mp3")
        srt_path = os.path.join(self.temp_dir, f"final_subtitles_{unique_id}.srt")

        self.output_files.append((srt_path, audio_path))
        self.cleanup_old_files()

        return srt_path, audio_path

    def cleanup_old_files(self):
        """Delete every tracked pair except the newest ``max_files_to_keep``."""
        if len(self.output_files) <= self.max_files_to_keep:
            return
        keep_from = len(self.output_files) - self.max_files_to_keep
        for srt_path, audio_path in self.output_files[:keep_from]:
            # Each file is removed independently so one failure cannot
            # leave its sibling behind.
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)
        # Update the list to only include files we're keeping
        self.output_files = self.output_files[keep_from:]

    def cleanup_all(self):
        """Delete all tracked files, then the temp directory if it is empty."""
        for srt_path, audio_path in self.output_files:
            self._remove_quietly(srt_path)
            self._remove_quietly(audio_path)
        self.output_files = []
        try:
            os.rmdir(self.temp_dir)
        except OSError:
            pass  # Ignore if directory isn't empty or can't be removed
283
 
284
+ # Create global file manager
285
  file_manager = FileManager()
286
 
287
+ # This function generates an HTML download link.
288
+ # The `target="_blank"` attribute ensures that when this link is clicked,
289
+ # the download action opens in a new browser tab or window.
290
def create_download_link(audio_path, base_url="aman18811-wfr-01.hf.space"):
    """Return an HTML download button for *audio_path*, or ``None`` if it is ``None``.

    Args:
        audio_path: Path of the generated audio file.
        base_url: Host serving the app. The default is the value that was
            previously hard-coded; pass another host for other deployments.

    The link opens in a new tab (``target="_blank"``). Its click handler
    fetches the file as a blob and downloads it under the name held in the
    ``download`` attribute. The path is percent-encoded for the URL, and both
    URL and filename are HTML-escaped, so unusual characters cannot break out
    of the markup or the inline script.
    """
    if audio_path is None:
        return None

    from html import escape
    from urllib.parse import quote

    filename = escape(Path(audio_path).name, quote=True)
    # URL format follows Gradio's file serving pattern
    encoded_path = quote(str(audio_path), safe="/:")
    file_url = escape(f"https://{base_url}/gradio_api/file={encoded_path}", quote=True)

    # The handler reads the name from the element's own attribute instead of
    # having the filename spliced into the script text.
    on_click = """event.preventDefault(); fetch(this.href).then(resp => resp.blob()).then(blob => {
            const url = window.URL.createObjectURL(blob);
            const a = document.createElement('a');
            a.style.display = 'none';
            a.href = url;
            a.download = this.getAttribute('download');
            document.body.appendChild(a);
            a.click();
            window.URL.revokeObjectURL(url);
            document.body.removeChild(a);
        });"""

    return f"""
    <a href="{file_url}"
       download="{filename}"
       target="_blank"
       rel="noopener noreferrer"
       style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
       onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
       onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='none';"
       onclick="{on_click}">
        Download Audio File
    </a>
    """
321
+
322
+ # IMPROVEMENT 3: Parallel Processing for Segments
async def generate_accurate_srt(
    text: str,
    voice: str,
    rate: str,
    pitch: str,
    words_per_line: int,
    lines_per_segment: int,
    progress_callback=None,
    parallel: bool = True,
    max_workers: int = 4
) -> Tuple[str, str]:
    """Synthesise *text* segment by segment and write matching MP3 and SRT files.

    Args:
        text: Text to convert.
        voice, rate, pitch: Passed through to ``process_segment_with_timing``.
        words_per_line, lines_per_segment: Layout limits for ``TextProcessor``.
        progress_callback: Optional ``callback(fraction, status)``.
        parallel: Process segments concurrently when there is more than one.
        max_workers: Upper bound on concurrent segment tasks (minimum 1).

    Returns:
        ``(srt_path, audio_path)`` of the files that were written.

    Raises:
        TTSError: If a segment cannot be processed or the final files cannot
            be exported. The original exception is attached as ``__cause__``.
    """
    def report(value, status):
        if progress_callback:
            progress_callback(value, status)

    processor = TextProcessor(words_per_line, lines_per_segment)
    segments = processor.split_into_segments(text)

    total_segments = len(segments)
    processed_segments = []

    report(0.1, "Text segmentation complete")

    if parallel and total_segments > 1:
        processed_count = 0
        # A semaphore created with 0 permits would block every task forever.
        semaphore = asyncio.Semaphore(max(1, max_workers))

        async def process_with_semaphore(segment):
            nonlocal processed_count
            async with semaphore:
                try:
                    result = await process_segment_with_timing(segment, voice, rate, pitch)
                    processed_count += 1
                    report(
                        0.1 + (0.8 * processed_count / total_segments),
                        f"Processed {processed_count}/{total_segments} segments",
                    )
                    return result
                except Exception as e:
                    # Count the failed segment too so the progress bar keeps moving.
                    processed_count += 1
                    report(
                        0.1 + (0.8 * processed_count / total_segments),
                        f"Error in segment {segment.id}: {str(e)}",
                    )
                    raise

        segment_tasks = [process_with_semaphore(segment) for segment in segments]

        try:
            processed_segments = list(await asyncio.gather(*segment_tasks))
        except Exception as e:
            report(0.9, f"Error during parallel processing: {str(e)}")
            raise TTSError(f"Failed during parallel processing: {str(e)}") from e
    else:
        # Process segments sequentially
        for i, segment in enumerate(segments):
            try:
                processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
                processed_segments.append(processed_segment)
                report(
                    0.1 + (0.8 * (i + 1) / total_segments),
                    f"Processed {i + 1}/{total_segments} segments",
                )
            except Exception as e:
                report(0.9, f"Error processing segment {segment.id}: {str(e)}")
                raise TTSError(f"Failed to process segment {segment.id}: {str(e)}") from e

    # Parallel completion order is arbitrary; ids restore the reading order.
    processed_segments.sort(key=lambda s: s.id)

    report(0.9, "Finalizing audio and subtitles")

    current_time = 0
    final_audio = AudioSegment.empty()
    srt_entries = []

    for segment in processed_segments:
        # Each cue starts exactly where the previous audio clip ended.
        segment.start_time = current_time
        segment.end_time = current_time + segment.duration

        srt_entries.append(
            f"{segment.id}\n"
            f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
            f"{segment.text}\n\n"
        )

        final_audio = final_audio.append(segment.audio, crossfade=0)
        current_time = segment.end_time

    srt_content = "".join(srt_entries)

    srt_path, audio_path = file_manager.create_output_paths()

    try:
        export_params = {
            'format': 'mp3',
            'bitrate': '192k',
            'parameters': [
                '-ar', '44100',  # Sample rate
                '-ac', '2',      # Stereo
                # For FFmpeg's libmp3lame this selects algorithm quality
                # (0 = highest quality, slowest), not a compression ratio.
                '-compression_level', '0',
                # NOTE(review): a fixed bitrate and a VBR qscale are both
                # requested; confirm which one the encoder honours.
                '-qscale:a', '2'
            ]
        }
        final_audio.export(audio_path, **export_params)

        with open(srt_path, "w", encoding='utf-8') as f:
            f.write(srt_content)
    except Exception as e:
        report(1.0, f"Error exporting final files: {str(e)}")
        raise TTSError(f"Failed to export final files: {str(e)}") from e

    report(1.0, "Complete!")

    return srt_path, audio_path
454
 
455
+ # IMPROVEMENT 4: Progress Reporting with proper error handling for older Gradio versions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  async def process_text_with_progress(
457
+ text,
458
+ pitch,
459
+ rate,
460
+ voice,
461
+ words_per_line,
462
+ lines_per_segment,
463
+ parallel_processing,
464
  progress=gr.Progress()
465
  ):
466
+ # Input validation
 
 
 
 
467
  if not text or text.strip() == "":
468
+ return None, None, None, True, "Please enter some text to convert to speech."
469
 
470
+ # Format pitch and rate strings
471
+ pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
472
+ rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
473
 
474
  try:
475
+ # Start progress tracking
476
  progress(0, "Preparing text...")
477
 
478
  def update_progress(value, status):
479
  progress(value, status)
480
 
481
  srt_path, audio_path = await generate_accurate_srt(
482
+ text,
483
+ voice_options[voice],
484
+ rate_str,
485
+ pitch_str,
486
+ words_per_line,
487
+ lines_per_segment,
488
  progress_callback=update_progress,
489
  parallel=parallel_processing
490
  )
491
 
492
+ # If successful, return results and hide error
493
+ return srt_path, audio_path, audio_path, False, ""
494
+ except TTSError as e:
495
+ # Return specific TTS error
496
+ return None, None, None, True, f"TTS Error: {str(e)}"
 
497
  except Exception as e:
498
+ # Return any other error
499
+ return None, None, None, True, f"Unexpected error: {str(e)}"
 
 
 
500
 
501
+ # Voice options dictionary
502
  voice_options = {
503
+ "Andrew Male": "en-US-AndrewNeural",
504
+ "Jenny Female": "en-US-JennyNeural",
505
+ "Guy Male": "en-US-GuyNeural",
506
+ "Ana Female": "en-US-AnaNeural",
507
+ "Aria Female": "en-US-AriaNeural",
508
+ "Brian Male": "en-US-BrianNeural",
509
+ "Christopher Male": "en-US-ChristopherNeural",
510
+ "Eric Male": "en-US-EricNeural",
511
+ "Michelle Male": "en-US-MichelleNeural",
512
+ "Roger Male": "en-US-RogerNeural",
513
+ "Natasha Female": "en-AU-NatashaNeural",
514
+ "William Male": "en-AU-WilliamNeural",
515
+ "Clara Female": "en-CA-ClaraNeural",
516
+ "Liam Female ": "en-CA-LiamNeural",
517
+ "Libby Female": "en-GB-LibbyNeural",
518
+ "Maisie": "en-GB-MaisieNeural",
519
+ "Ryan": "en-GB-RyanNeural",
520
+ "Sonia": "en-GB-SoniaNeural",
521
+ "Thomas": "en-GB-ThomasNeural",
522
+ "Sam": "en-HK-SamNeural",
523
+ "Yan": "en-HK-YanNeural",
524
+ "Connor": "en-IE-ConnorNeural",
525
+ "Emily": "en-IE-EmilyNeural",
526
+ "Neerja": "en-IN-NeerjaNeural",
527
+ "Prabhat": "en-IN-PrabhatNeural",
528
+ "Asilia": "en-KE-AsiliaNeural",
529
+ "Chilemba": "en-KE-ChilembaNeural",
530
+ "Abeo": "en-NG-AbeoNeural",
531
+ "Ezinne": "en-NG-EzinneNeural",
532
+ "Mitchell": "en-NZ-MitchellNeural",
533
+ "James": "en-PH-JamesNeural",
534
+ "Rosa": "en-PH-RosaNeural",
535
+ "Luna": "en-SG-LunaNeural",
536
+ "Wayne": "en-SG-WayneNeural",
537
+ "Elimu": "en-TZ-ElimuNeural",
538
+ "Imani": "en-TZ-ImaniNeural",
539
+ "Leah": "en-ZA-LeahNeural",
540
+ "Luke": "en-ZA-LukeNeural"
541
+ # Add other voices as needed
542
  }
543
 
544
+ # Register cleanup on exit
545
  import atexit
  # Ask the interpreter to call file_manager.cleanup_all at normal shutdown.
  atexit.register(file_manager.cleanup_all)
547
 
548
+ # Create Gradio interface
549
  with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
550
  gr.Markdown("# Advanced TTS with Configurable SRT Generation")
551
  gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
 
553
  with gr.Row():
554
  with gr.Column(scale=3):
555
  text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
556
+
557
  with gr.Column(scale=2):
558
+ voice_dropdown = gr.Dropdown(
559
+ label="Select Voice",
560
+ choices=list(voice_options.keys()),
561
+ value="Jenny Female"
562
+ )
563
+ pitch_slider = gr.Slider(
564
+ label="Pitch Adjustment (Hz)",
565
+ minimum=-10,
566
+ maximum=10,
567
+ value=0,
568
+ step=1
569
+ )
570
+ rate_slider = gr.Slider(
571
+ label="Rate Adjustment (%)",
572
+ minimum=-25,
573
+ maximum=25,
574
+ value=0,
575
+ step=1
576
+ )
577
 
578
  with gr.Row():
579
  with gr.Column():
580
+ words_per_line = gr.Slider(
581
+ label="Words per Line",
582
+ minimum=3,
583
+ maximum=12,
584
+ value=6,
585
+ step=1,
586
+ info="Controls how many words appear on each line of the subtitle"
587
+ )
588
  with gr.Column():
589
+ lines_per_segment = gr.Slider(
590
+ label="Lines per Segment",
591
+ minimum=1,
592
+ maximum=4,
593
+ value=2,
594
+ step=1,
595
+ info="Controls how many lines appear in each subtitle segment"
596
+ )
597
  with gr.Column():
598
+ parallel_processing = gr.Checkbox(
599
+ label="Enable Parallel Processing",
600
+ value=True,
601
+ info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
602
+ )
603
 
604
+ submit_btn = gr.Button("Generate Audio & Subtitles")
605
+
606
+ # Add error message component
607
+ error_output = gr.Textbox(label="Status", visible=False)
608
 
 
 
609
  with gr.Row():
610
+ with gr.Column():
611
+ audio_output = gr.Audio(label="Preview Audio")
612
+ with gr.Column():
613
+ srt_file = gr.File(label="Download SRT")
614
+ # The download_link HTML component will contain an <a> tag with target="_blank"
615
+ # This ensures that when the generated audio/SRT is downloaded via this link,
616
+ # it will open in a new browser tab.
617
+ download_link = gr.HTML(elem_classes="download-btn")
618
+ # The audio_file component is typically for direct download via Gradio's file handling,
619
+ # which might not open a new tab depending on browser settings.
620
+ # The HTML download_link provides more control over opening in a new tab.
621
+ audio_file = gr.File(label="Download Audio (Direct)")
622
+
623
+ # Handle button click with manual error handling instead of .catch()
624
+ # When submit_btn is clicked, it calls process_text_with_progress.
625
+ # This function processes the inputs and updates the outputs on the *current* Gradio page.
626
+ # It does NOT open a new page itself.
627
+ # The 'download_link' HTML output, however, contains an <a> tag designed to open in a new tab.
628
  submit_btn.click(
629
  fn=process_text_with_progress,
630
  inputs=[
631
+ text_input,
632
+ pitch_slider,
633
+ rate_slider,
634
+ voice_dropdown,
635
+ words_per_line,
636
+ lines_per_segment,
637
+ parallel_processing
638
  ],
639
  outputs=[
640
+ srt_file,
641
+ audio_file,
642
+ audio_output,
643
+ error_output,
644
+ download_link # Ensure download_link is updated with the new HTML for download
645
  ],
646
  api_name="generate"
647
  )
 
648
 
649
  if __name__ == "__main__":
      # Launch the Gradio app only when this file is executed as a script.
      app.launch()