hivecorp committed on
Commit
81ee5ca
·
verified ·
1 Parent(s): 266d5cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -16
app.py CHANGED
@@ -9,6 +9,11 @@ from concurrent.futures import ThreadPoolExecutor
9
  from typing import List, Tuple, Optional
10
  import math
11
  from dataclasses import dataclass
 
 
 
 
 
12
 
13
  class TimingManager:
14
  def __init__(self):
@@ -182,27 +187,78 @@ class TextProcessor:
182
 
183
  return lines
184
 
185
- async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
186
- """Process a complete segment as a single TTS unit"""
187
- audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  try:
189
- # Process the entire segment text as one unit, replacing newlines with spaces
190
- segment_text = ' '.join(segment.text.split('\n'))
191
- tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
 
192
  await tts.save(audio_file)
193
 
194
  segment.audio = AudioSegment.from_file(audio_file)
195
- # Add small silence at start and end for natural spacing
196
- silence = AudioSegment.silent(duration=50)
197
- segment.audio = silence + segment.audio + silence
198
  segment.duration = len(segment.audio)
199
 
 
200
  return segment
 
 
 
201
  finally:
202
  if os.path.exists(audio_file):
203
  os.remove(audio_file)
204
 
205
- async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
206
  processor = TextProcessor(words_per_line, lines_per_segment)
207
  segments = processor.split_into_segments(text)
208
 
@@ -211,10 +267,11 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
211
  current_time = 0
212
  final_audio = AudioSegment.empty()
213
  srt_content = ""
 
214
 
215
- for segment in segments:
216
  # Process segment
217
- processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
218
 
219
  # Calculate precise timing
220
  processed_segment.start_time = current_time
@@ -252,7 +309,7 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
252
 
253
  return srt_path, audio_path
254
 
255
- async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
256
  # Format pitch and rate strings
257
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
258
  rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
@@ -263,7 +320,11 @@ async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segme
263
  rate_str,
264
  pitch_str,
265
  words_per_line,
266
- lines_per_segment
 
 
 
 
267
  )
268
 
269
  return srt_path, audio_path, audio_path
@@ -320,12 +381,17 @@ app = gr.Interface(
320
  gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
321
  gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
322
  gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
323
- gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1)
 
 
 
 
324
  ],
325
  outputs=[
326
  gr.File(label="Download SRT"),
327
  gr.File(label="Download Audio"),
328
- gr.Audio(label="Preview Audio")
 
329
  ],
330
  title="Advanced TTS with Configurable SRT Generation",
331
  description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
 
9
  from typing import List, Tuple, Optional
10
  import math
11
  from dataclasses import dataclass
12
+ import hashlib
13
+ import json
14
+ from pathlib import Path
15
+ from tqdm.asyncio import tqdm
16
+ import ssml.builder as ssml
17
 
18
  class TimingManager:
19
  def __init__(self):
 
187
 
188
  return lines
189
 
190
+ class AudioCache:
191
+ def __init__(self, cache_dir="./cache"):
192
+ self.cache_dir = Path(cache_dir)
193
+ self.cache_dir.mkdir(exist_ok=True)
194
+
195
+ def get_cache_key(self, text: str, voice: str, rate: str, pitch: str) -> str:
196
+ data = f"{text}{voice}{rate}{pitch}".encode()
197
+ return hashlib.md5(data).hexdigest()
198
+
199
+ def get_cached_audio(self, cache_key: str) -> Optional[AudioSegment]:
200
+ cache_file = self.cache_dir / f"{cache_key}.wav"
201
+ if cache_file.exists():
202
+ return AudioSegment.from_file(str(cache_file))
203
+ return None
204
+
205
+ def cache_audio(self, cache_key: str, audio: AudioSegment):
206
+ cache_file = self.cache_dir / f"{cache_key}.wav"
207
+ audio.export(str(cache_file), format="wav")
208
+
209
+ class SpeechEnhancer:
210
+ @staticmethod
211
+ def add_speech_marks(text: str) -> str:
212
+ """Add SSML marks for better speech control"""
213
+ speech = ssml.Speech()
214
+ # Add prosody and breaks for natural speech
215
+ speech.prosody(rate="medium", pitch="medium", volume="medium")
216
+ for sentence in text.split('. '):
217
+ speech.p(sentence.strip())
218
+ speech.break_("medium")
219
+ return str(speech)
220
+
221
+ @staticmethod
222
+ def enhance_timing(segment: Segment) -> Segment:
223
+ """Add natural pauses based on punctuation"""
224
+ if segment.audio:
225
+ for punct, pause_ms in {'.': 400, '!': 400, '?': 400, ',': 200, ';': 300}.items():
226
+ if punct in segment.text:
227
+ silence = AudioSegment.silent(duration=pause_ms)
228
+ segment.audio = segment.audio.append(silence, crossfade=50)
229
+ return segment
230
+
231
+ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str, cache: AudioCache) -> Segment:
232
+ """Process segment with enhanced speech features"""
233
+ cache_key = cache.get_cache_key(segment.text, voice, rate, pitch)
234
+ cached_audio = cache.get_cached_audio(cache_key)
235
+
236
+ if cached_audio:
237
+ segment.audio = cached_audio
238
+ segment.duration = len(cached_audio)
239
+ return segment
240
+
241
  try:
242
+ enhanced_text = SpeechEnhancer.add_speech_marks(segment.text)
243
+ tts = edge_tts.Communicate(enhanced_text, voice, rate=rate, pitch=pitch)
244
+
245
+ audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
246
  await tts.save(audio_file)
247
 
248
  segment.audio = AudioSegment.from_file(audio_file)
249
+ segment = SpeechEnhancer.enhance_timing(segment)
 
 
250
  segment.duration = len(segment.audio)
251
 
252
+ cache.cache_audio(cache_key, segment.audio)
253
  return segment
254
+ except Exception as e:
255
+ print(f"Error processing segment {segment.id}: {str(e)}")
256
+ raise
257
  finally:
258
  if os.path.exists(audio_file):
259
  os.remove(audio_file)
260
 
261
+ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int, enable_ssml: bool, use_cache: bool, pause_after_period: int, pause_after_comma: int) -> Tuple[str, str]:
262
  processor = TextProcessor(words_per_line, lines_per_segment)
263
  segments = processor.split_into_segments(text)
264
 
 
267
  current_time = 0
268
  final_audio = AudioSegment.empty()
269
  srt_content = ""
270
+ cache = AudioCache() if use_cache else None
271
 
272
+ for segment in tqdm(segments, desc="Processing segments"):
273
  # Process segment
274
+ processed_segment = await process_segment_with_timing(segment, voice, rate, pitch, cache)
275
 
276
  # Calculate precise timing
277
  processed_segment.start_time = current_time
 
309
 
310
  return srt_path, audio_path
311
 
312
+ async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment, enable_ssml, use_cache, pause_after_period, pause_after_comma):
313
  # Format pitch and rate strings
314
  pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
315
  rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
 
320
  rate_str,
321
  pitch_str,
322
  words_per_line,
323
+ lines_per_segment,
324
+ enable_ssml,
325
+ use_cache,
326
+ pause_after_period,
327
+ pause_after_comma
328
  )
329
 
330
  return srt_path, audio_path, audio_path
 
381
  gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
382
  gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
383
  gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
384
+ gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1),
385
+ gr.Checkbox(label="Enable SSML Enhancement", value=True),
386
+ gr.Checkbox(label="Use Audio Cache", value=True),
387
+ gr.Slider(label="Pause After Period (ms)", minimum=200, maximum=800, value=400, step=50),
388
+ gr.Slider(label="Pause After Comma (ms)", minimum=100, maximum=400, value=200, step=50)
389
  ],
390
  outputs=[
391
  gr.File(label="Download SRT"),
392
  gr.File(label="Download Audio"),
393
+ gr.Audio(label="Preview Audio"),
394
+ gr.HTML(label="Processing Status")
395
  ],
396
  title="Advanced TTS with Configurable SRT Generation",
397
  description="Generate perfectly synchronized audio and subtitles with natural speech patterns."