Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,11 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 9 |
from typing import List, Tuple, Optional
|
| 10 |
import math
|
| 11 |
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
class TimingManager:
|
| 14 |
def __init__(self):
|
|
@@ -182,27 +187,78 @@ class TextProcessor:
|
|
| 182 |
|
| 183 |
return lines
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
| 192 |
await tts.save(audio_file)
|
| 193 |
|
| 194 |
segment.audio = AudioSegment.from_file(audio_file)
|
| 195 |
-
|
| 196 |
-
silence = AudioSegment.silent(duration=50)
|
| 197 |
-
segment.audio = silence + segment.audio + silence
|
| 198 |
segment.duration = len(segment.audio)
|
| 199 |
|
|
|
|
| 200 |
return segment
|
|
|
|
|
|
|
|
|
|
| 201 |
finally:
|
| 202 |
if os.path.exists(audio_file):
|
| 203 |
os.remove(audio_file)
|
| 204 |
|
| 205 |
-
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
|
| 206 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 207 |
segments = processor.split_into_segments(text)
|
| 208 |
|
|
@@ -211,10 +267,11 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
|
|
| 211 |
current_time = 0
|
| 212 |
final_audio = AudioSegment.empty()
|
| 213 |
srt_content = ""
|
|
|
|
| 214 |
|
| 215 |
-
for segment in segments:
|
| 216 |
# Process segment
|
| 217 |
-
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
|
| 218 |
|
| 219 |
# Calculate precise timing
|
| 220 |
processed_segment.start_time = current_time
|
|
@@ -252,7 +309,7 @@ async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, wo
|
|
| 252 |
|
| 253 |
return srt_path, audio_path
|
| 254 |
|
| 255 |
-
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment):
|
| 256 |
# Format pitch and rate strings
|
| 257 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 258 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
|
@@ -263,7 +320,11 @@ async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segme
|
|
| 263 |
rate_str,
|
| 264 |
pitch_str,
|
| 265 |
words_per_line,
|
| 266 |
-
lines_per_segment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
)
|
| 268 |
|
| 269 |
return srt_path, audio_path, audio_path
|
|
@@ -320,12 +381,17 @@ app = gr.Interface(
|
|
| 320 |
gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
|
| 321 |
gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
|
| 322 |
gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
|
| 323 |
-
gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
],
|
| 325 |
outputs=[
|
| 326 |
gr.File(label="Download SRT"),
|
| 327 |
gr.File(label="Download Audio"),
|
| 328 |
-
gr.Audio(label="Preview Audio")
|
|
|
|
| 329 |
],
|
| 330 |
title="Advanced TTS with Configurable SRT Generation",
|
| 331 |
description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
|
|
|
|
| 9 |
from typing import List, Tuple, Optional
|
| 10 |
import math
|
| 11 |
from dataclasses import dataclass
|
| 12 |
+
import hashlib
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from tqdm.asyncio import tqdm
|
| 16 |
+
import ssml.builder as ssml
|
| 17 |
|
| 18 |
class TimingManager:
|
| 19 |
def __init__(self):
|
|
|
|
| 187 |
|
| 188 |
return lines
|
| 189 |
|
| 190 |
+
class AudioCache:
    """Disk-backed cache of synthesized audio, keyed by (text, voice, rate, pitch).

    Avoids re-synthesizing identical segments across runs by storing one WAV
    file per request under ``cache_dir``, named by an MD5 digest of the
    request parameters.
    """

    def __init__(self, cache_dir="./cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache path (e.g. "out/tts/cache") also works;
        # plain exist_ok=True alone would raise FileNotFoundError in that case.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_key(self, text: str, voice: str, rate: str, pitch: str) -> str:
        """Return a stable hex digest identifying this synthesis request."""
        # MD5 is acceptable here: the digest is a cache identity, not a
        # security token, and shorter filenames are a plus.
        data = f"{text}{voice}{rate}{pitch}".encode()
        return hashlib.md5(data).hexdigest()

    def get_cached_audio(self, cache_key: str) -> Optional["AudioSegment"]:
        """Load the cached audio for ``cache_key``, or None on a cache miss."""
        cache_file = self.cache_dir / f"{cache_key}.wav"
        if cache_file.exists():
            return AudioSegment.from_file(str(cache_file))
        return None

    def cache_audio(self, cache_key: str, audio: "AudioSegment"):
        """Persist ``audio`` as a WAV file under ``cache_key``."""
        # Forward-ref annotations (quoted) keep the class definable even if
        # pydub's AudioSegment is imported lazily elsewhere.
        cache_file = self.cache_dir / f"{cache_key}.wav"
        audio.export(str(cache_file), format="wav")
|
| 208 |
+
|
| 209 |
+
class SpeechEnhancer:
    """Static helpers that wrap raw text in SSML markup and pad segment audio
    with punctuation-based pauses before it is stitched into the final track."""

    @staticmethod
    def add_speech_marks(text: str) -> str:
        """Add SSML marks for better speech control"""
        # Builds a document via the third-party ssml.builder API.
        # NOTE(review): splitting on '. ' discards the periods themselves from
        # each paragraph's text — confirm the TTS engine re-inserts sentence
        # intonation, otherwise the final '.' of every sentence is lost.
        speech = ssml.Speech()
        # Add prosody and breaks for natural speech
        speech.prosody(rate="medium", pitch="medium", volume="medium")
        for sentence in text.split('. '):
            speech.p(sentence.strip())
            # A "medium" break is appended after every sentence, including the
            # last one — presumably intentional trailing pause; verify.
            speech.break_("medium")
        return str(speech)

    @staticmethod
    def enhance_timing(segment: Segment) -> Segment:
        """Add natural pauses based on punctuation"""
        # For each punctuation mark PRESENT ANYWHERE in the segment text, a
        # silence of the mapped length is appended to the END of the audio
        # with a 50 ms crossfade. NOTE(review): pauses are not inserted at the
        # punctuation positions, and multiple distinct marks stack multiple
        # trailing silences (e.g. '.' and ',' both present → 400ms + 200ms);
        # confirm this is the intended timing model.
        if segment.audio:
            for punct, pause_ms in {'.': 400, '!': 400, '?': 400, ',': 200, ';': 300}.items():
                if punct in segment.text:
                    silence = AudioSegment.silent(duration=pause_ms)
                    segment.audio = segment.audio.append(silence, crossfade=50)
        return segment
|
| 230 |
+
|
| 231 |
+
async def process_segment_with_timing(segment: "Segment", voice: str, rate: str, pitch: str, cache: Optional["AudioCache"] = None) -> "Segment":
    """Synthesize a text segment into audio with SSML enhancement and caching.

    Args:
        segment: The text segment to voice; its ``audio`` and ``duration``
            fields are populated in place.
        voice / rate / pitch: edge-tts synthesis parameters.
        cache: Optional AudioCache. May be None (the caller passes None when
            caching is disabled) — in that case synthesis always runs and
            nothing is persisted.

    Returns:
        The same ``segment``, with ``audio`` and ``duration`` set.

    Raises:
        Re-raises any synthesis/decoding error after logging it.
    """
    cache_key = None
    if cache is not None:
        # BUGFIX: the previous version dereferenced `cache` unconditionally,
        # crashing with AttributeError when caching was disabled (cache=None).
        cache_key = cache.get_cache_key(segment.text, voice, rate, pitch)
        cached_audio = cache.get_cached_audio(cache_key)
        if cached_audio:
            segment.audio = cached_audio
            segment.duration = len(cached_audio)
            return segment

    # BUGFIX: assign the temp filename BEFORE the try block so `finally` can
    # always reference it; previously a failure in add_speech_marks() or
    # Communicate() raised UnboundLocalError in finally, masking the real error.
    audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
    try:
        enhanced_text = SpeechEnhancer.add_speech_marks(segment.text)
        tts = edge_tts.Communicate(enhanced_text, voice, rate=rate, pitch=pitch)
        await tts.save(audio_file)

        segment.audio = AudioSegment.from_file(audio_file)
        segment = SpeechEnhancer.enhance_timing(segment)
        segment.duration = len(segment.audio)

        if cache is not None:
            cache.cache_audio(cache_key, segment.audio)
        return segment
    except Exception as e:
        print(f"Error processing segment {segment.id}: {str(e)}")
        raise
    finally:
        # Best-effort cleanup of the temp file on both success and failure.
        if os.path.exists(audio_file):
            os.remove(audio_file)
|
| 260 |
|
| 261 |
+
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int, enable_ssml: bool, use_cache: bool, pause_after_period: int, pause_after_comma: int) -> Tuple[str, str]:
|
| 262 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 263 |
segments = processor.split_into_segments(text)
|
| 264 |
|
|
|
|
| 267 |
current_time = 0
|
| 268 |
final_audio = AudioSegment.empty()
|
| 269 |
srt_content = ""
|
| 270 |
+
cache = AudioCache() if use_cache else None
|
| 271 |
|
| 272 |
+
for segment in tqdm(segments, desc="Processing segments"):
|
| 273 |
# Process segment
|
| 274 |
+
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch, cache)
|
| 275 |
|
| 276 |
# Calculate precise timing
|
| 277 |
processed_segment.start_time = current_time
|
|
|
|
| 309 |
|
| 310 |
return srt_path, audio_path
|
| 311 |
|
| 312 |
+
async def process_text(text, pitch, rate, voice, words_per_line, lines_per_segment, enable_ssml, use_cache, pause_after_period, pause_after_comma):
|
| 313 |
# Format pitch and rate strings
|
| 314 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 315 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
|
|
|
| 320 |
rate_str,
|
| 321 |
pitch_str,
|
| 322 |
words_per_line,
|
| 323 |
+
lines_per_segment,
|
| 324 |
+
enable_ssml,
|
| 325 |
+
use_cache,
|
| 326 |
+
pause_after_period,
|
| 327 |
+
pause_after_comma
|
| 328 |
)
|
| 329 |
|
| 330 |
return srt_path, audio_path, audio_path
|
|
|
|
| 381 |
gr.Slider(label="Rate Adjustment (%)", minimum=-25, maximum=25, value=0, step=1),
|
| 382 |
gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Jenny Female"),
|
| 383 |
gr.Slider(label="Words per Line", minimum=3, maximum=12, value=6, step=1),
|
| 384 |
+
gr.Slider(label="Lines per Segment", minimum=1, maximum=4, value=2, step=1),
|
| 385 |
+
gr.Checkbox(label="Enable SSML Enhancement", value=True),
|
| 386 |
+
gr.Checkbox(label="Use Audio Cache", value=True),
|
| 387 |
+
gr.Slider(label="Pause After Period (ms)", minimum=200, maximum=800, value=400, step=50),
|
| 388 |
+
gr.Slider(label="Pause After Comma (ms)", minimum=100, maximum=400, value=200, step=50)
|
| 389 |
],
|
| 390 |
outputs=[
|
| 391 |
gr.File(label="Download SRT"),
|
| 392 |
gr.File(label="Download Audio"),
|
| 393 |
+
gr.Audio(label="Preview Audio"),
|
| 394 |
+
gr.HTML(label="Processing Status")
|
| 395 |
],
|
| 396 |
title="Advanced TTS with Configurable SRT Generation",
|
| 397 |
description="Generate perfectly synchronized audio and subtitles with natural speech patterns."
|