Update app.py
Browse files
app.py
CHANGED
|
@@ -20,7 +20,7 @@ class TimingManager:
|
|
| 20 |
def get_timing(self, duration):
|
| 21 |
start_time = self.current_time
|
| 22 |
end_time = start_time + duration
|
| 23 |
-
self.current_time = end_time +
|
| 24 |
return start_time, end_time
|
| 25 |
|
| 26 |
def get_audio_length(audio_file):
|
|
@@ -184,7 +184,6 @@ class TextProcessor:
|
|
| 184 |
|
| 185 |
return lines
|
| 186 |
|
| 187 |
-
# IMPROVEMENT 1: Enhanced Error Handling
|
| 188 |
class TTSError(Exception):
|
| 189 |
"""Custom exception for TTS processing errors"""
|
| 190 |
pass
|
|
@@ -226,7 +225,6 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
|
|
| 226 |
except Exception:
|
| 227 |
pass # Ignore deletion errors
|
| 228 |
|
| 229 |
-
# IMPROVEMENT 2: Better File Management with cleanup
|
| 230 |
class FileManager:
|
| 231 |
"""Manages temporary and output files with cleanup capabilities"""
|
| 232 |
def __init__(self):
|
|
@@ -281,10 +279,8 @@ class FileManager:
|
|
| 281 |
except Exception:
|
| 282 |
pass # Ignore if directory isn't empty or can't be removed
|
| 283 |
|
| 284 |
-
# Create global file manager
|
| 285 |
file_manager = FileManager()
|
| 286 |
|
| 287 |
-
# IMPROVEMENT 3: Parallel Processing for Segments
|
| 288 |
async def generate_accurate_srt(
|
| 289 |
text: str,
|
| 290 |
voice: str,
|
|
@@ -303,16 +299,12 @@ async def generate_accurate_srt(
|
|
| 303 |
total_segments = len(segments)
|
| 304 |
processed_segments = []
|
| 305 |
|
| 306 |
-
# Update progress to show segmentation is complete
|
| 307 |
if progress_callback:
|
| 308 |
progress_callback(0.1, "Text segmentation complete")
|
| 309 |
|
| 310 |
if parallel and total_segments > 1:
|
| 311 |
-
# Process segments in parallel
|
| 312 |
processed_count = 0
|
| 313 |
segment_tasks = []
|
| 314 |
-
|
| 315 |
-
# Create a semaphore to limit concurrent tasks
|
| 316 |
semaphore = asyncio.Semaphore(max_workers)
|
| 317 |
|
| 318 |
async def process_with_semaphore(segment):
|
|
@@ -326,18 +318,15 @@ async def generate_accurate_srt(
|
|
| 326 |
progress_callback(progress, f"Processed {processed_count}/{total_segments} segments")
|
| 327 |
return result
|
| 328 |
except Exception as e:
|
| 329 |
-
# Handle errors in individual segments
|
| 330 |
processed_count += 1
|
| 331 |
if progress_callback:
|
| 332 |
progress = 0.1 + (0.8 * processed_count / total_segments)
|
| 333 |
progress_callback(progress, f"Error in segment {segment.id}: {str(e)}")
|
| 334 |
raise
|
| 335 |
|
| 336 |
-
# Create tasks for all segments
|
| 337 |
for segment in segments:
|
| 338 |
segment_tasks.append(process_with_semaphore(segment))
|
| 339 |
|
| 340 |
-
# Run all tasks and collect results
|
| 341 |
try:
|
| 342 |
processed_segments = await asyncio.gather(*segment_tasks)
|
| 343 |
except Exception as e:
|
|
@@ -345,7 +334,6 @@ async def generate_accurate_srt(
|
|
| 345 |
progress_callback(0.9, f"Error during parallel processing: {str(e)}")
|
| 346 |
raise TTSError(f"Failed during parallel processing: {str(e)}")
|
| 347 |
else:
|
| 348 |
-
# Process segments sequentially (original method)
|
| 349 |
for i, segment in enumerate(segments):
|
| 350 |
try:
|
| 351 |
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
|
|
@@ -359,48 +347,39 @@ async def generate_accurate_srt(
|
|
| 359 |
progress_callback(0.9, f"Error processing segment {segment.id}: {str(e)}")
|
| 360 |
raise TTSError(f"Failed to process segment {segment.id}: {str(e)}")
|
| 361 |
|
| 362 |
-
# Sort segments by ID to ensure correct order
|
| 363 |
processed_segments.sort(key=lambda s: s.id)
|
| 364 |
|
| 365 |
if progress_callback:
|
| 366 |
progress_callback(0.9, "Finalizing audio and subtitles")
|
| 367 |
|
| 368 |
-
# Now combine the segments in the correct order
|
| 369 |
current_time = 0
|
| 370 |
final_audio = AudioSegment.empty()
|
| 371 |
srt_content = ""
|
| 372 |
|
| 373 |
for segment in processed_segments:
|
| 374 |
-
# Calculate precise timing
|
| 375 |
segment.start_time = current_time
|
| 376 |
segment.end_time = current_time + segment.duration
|
| 377 |
|
| 378 |
-
# Add to SRT with precise timing
|
| 379 |
srt_content += (
|
| 380 |
f"{segment.id}\n"
|
| 381 |
f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
|
| 382 |
f"{segment.text}\n\n"
|
| 383 |
)
|
| 384 |
|
| 385 |
-
# Add to final audio with precise positioning
|
| 386 |
final_audio = final_audio.append(segment.audio, crossfade=0)
|
| 387 |
-
|
| 388 |
-
# Update timing with precise gap
|
| 389 |
current_time = segment.end_time
|
| 390 |
|
| 391 |
-
# Export with high precision
|
| 392 |
srt_path, audio_path = file_manager.create_output_paths()
|
| 393 |
|
| 394 |
try:
|
| 395 |
-
# Export with optimized quality settings and compression
|
| 396 |
export_params = {
|
| 397 |
'format': 'mp3',
|
| 398 |
-
'bitrate': '192k',
|
| 399 |
'parameters': [
|
| 400 |
-
'-ar', '44100',
|
| 401 |
-
'-ac', '2',
|
| 402 |
-
'-compression_level', '0',
|
| 403 |
-
'-qscale:a', '2'
|
| 404 |
]
|
| 405 |
}
|
| 406 |
final_audio.export(audio_path, **export_params)
|
|
@@ -417,6 +396,7 @@ async def generate_accurate_srt(
|
|
| 417 |
|
| 418 |
return srt_path, audio_path
|
| 419 |
|
|
|
|
| 420 |
async def process_text_with_progress(
|
| 421 |
text,
|
| 422 |
pitch,
|
|
@@ -427,31 +407,26 @@ async def process_text_with_progress(
|
|
| 427 |
parallel_processing,
|
| 428 |
progress=gr.Progress()
|
| 429 |
):
|
| 430 |
-
# Initialize outputs to their default 'hidden' state
|
| 431 |
-
#
|
| 432 |
-
# gr.
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
output_error = ""
|
| 437 |
|
| 438 |
# Input validation
|
| 439 |
if not text or text.strip() == "":
|
| 440 |
-
output_error = "Please enter some text to convert to speech."
|
| 441 |
-
# Update visibility of error_output only when an error occurs
|
| 442 |
return (
|
| 443 |
output_audio,
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
gr.update(value=
|
| 447 |
)
|
| 448 |
|
| 449 |
-
# Format pitch and rate strings
|
| 450 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 451 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
| 452 |
|
| 453 |
try:
|
| 454 |
-
# Start progress tracking
|
| 455 |
progress(0, "Preparing text...")
|
| 456 |
|
| 457 |
def update_progress(value, status):
|
|
@@ -468,50 +443,106 @@ async def process_text_with_progress(
|
|
| 468 |
parallel=parallel_processing
|
| 469 |
)
|
| 470 |
|
| 471 |
-
#
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
return (
|
| 480 |
-
|
| 481 |
-
gr.update(value=
|
| 482 |
-
gr.update(value=
|
| 483 |
-
gr.update(value=
|
| 484 |
)
|
| 485 |
except TTSError as e:
|
| 486 |
-
|
| 487 |
except Exception as e:
|
| 488 |
-
|
| 489 |
|
| 490 |
-
# Unified error return block
|
| 491 |
return (
|
| 492 |
-
None, #
|
| 493 |
gr.update(value="", visible=False), # Hide SRT download link
|
| 494 |
gr.update(value="", visible=False), # Hide Audio download link
|
| 495 |
-
gr.update(value=
|
| 496 |
)
|
| 497 |
|
| 498 |
-
#
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
"Maisie": "en-GB-MaisieNeural",
|
| 516 |
"Ryan": "en-GB-RyanNeural",
|
| 517 |
"Sonia": "en-GB-SoniaNeural",
|
|
@@ -535,15 +566,545 @@ voice_options = {
|
|
| 535 |
"Imani": "en-TZ-ImaniNeural",
|
| 536 |
"Leah": "en-ZA-LeahNeural",
|
| 537 |
"Luke": "en-ZA-LukeNeural"
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
}
|
| 540 |
|
| 541 |
-
#
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
atexit.register(file_manager.cleanup_all)
|
| 544 |
|
| 545 |
# Create Gradio interface
|
| 546 |
-
with gr.Blocks(title="Advanced TTS with Configurable SRT Generation"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
gr.Markdown("# Advanced TTS with Configurable SRT Generation")
|
| 548 |
gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
|
| 549 |
|
|
@@ -552,10 +1113,17 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
|
|
| 552 |
text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
|
| 553 |
|
| 554 |
with gr.Column(scale=2):
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
label="Select Voice",
|
| 557 |
-
choices=list(
|
| 558 |
-
value=
|
|
|
|
| 559 |
)
|
| 560 |
pitch_slider = gr.Slider(
|
| 561 |
label="Pitch Adjustment (Hz)",
|
|
@@ -597,28 +1165,39 @@ with gr.Blocks(title="Advanced TTS with Configurable SRT Generation") as app:
|
|
| 597 |
value=True,
|
| 598 |
info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
|
| 599 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
|
| 601 |
submit_btn = gr.Button("Generate Audio & Subtitles")
|
| 602 |
|
| 603 |
-
# Error message component - initially hidden
|
| 604 |
error_output = gr.Textbox(label="Status", visible=False, interactive=False)
|
| 605 |
|
| 606 |
with gr.Row():
|
| 607 |
with gr.Column():
|
| 608 |
audio_output = gr.Audio(label="Preview Audio")
|
| 609 |
with gr.Column():
|
| 610 |
-
#
|
| 611 |
-
srt_download_link = gr.
|
| 612 |
-
audio_download_link = gr.
|
| 613 |
-
|
| 614 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
submit_btn.click(
|
| 616 |
fn=process_text_with_progress,
|
| 617 |
inputs=[
|
| 618 |
text_input,
|
| 619 |
pitch_slider,
|
| 620 |
rate_slider,
|
| 621 |
-
|
| 622 |
words_per_line,
|
| 623 |
lines_per_segment,
|
| 624 |
parallel_processing
|
|
|
|
| 20 |
def get_timing(self, duration):
    """Reserve a time slot of length ``duration`` and advance the clock.

    The slot begins at the manager's current position. After the call,
    ``self.current_time`` points past the slot by ``self.segment_gap`` so
    that the next slot starts after that gap.

    Args:
        duration: Length of the slot, in the same unit as
            ``self.current_time``.

    Returns:
        A ``(start_time, end_time)`` tuple for the reserved slot.
    """
    begin = self.current_time
    finish = begin + duration
    # The gap is applied to the clock only; the returned end excludes it.
    self.current_time = finish + self.segment_gap
    return begin, finish
|
| 25 |
|
| 26 |
def get_audio_length(audio_file):
|
|
|
|
| 184 |
|
| 185 |
return lines
|
| 186 |
|
|
|
|
| 187 |
class TTSError(Exception):
    """Signal a failure while processing text-to-speech segments.

    Adds no behaviour of its own; it exists so callers can tell TTS
    processing failures apart from other exceptions.
    """
|
|
|
|
| 225 |
except Exception:
|
| 226 |
pass # Ignore deletion errors
|
| 227 |
|
|
|
|
| 228 |
class FileManager:
|
| 229 |
"""Manages temporary and output files with cleanup capabilities"""
|
| 230 |
def __init__(self):
|
|
|
|
| 279 |
except Exception:
|
| 280 |
pass # Ignore if directory isn't empty or can't be removed
|
| 281 |
|
|
|
|
| 282 |
file_manager = FileManager()
|
| 283 |
|
|
|
|
| 284 |
async def generate_accurate_srt(
|
| 285 |
text: str,
|
| 286 |
voice: str,
|
|
|
|
| 299 |
total_segments = len(segments)
|
| 300 |
processed_segments = []
|
| 301 |
|
|
|
|
| 302 |
if progress_callback:
|
| 303 |
progress_callback(0.1, "Text segmentation complete")
|
| 304 |
|
| 305 |
if parallel and total_segments > 1:
|
|
|
|
| 306 |
processed_count = 0
|
| 307 |
segment_tasks = []
|
|
|
|
|
|
|
| 308 |
semaphore = asyncio.Semaphore(max_workers)
|
| 309 |
|
| 310 |
async def process_with_semaphore(segment):
|
|
|
|
| 318 |
progress_callback(progress, f"Processed {processed_count}/{total_segments} segments")
|
| 319 |
return result
|
| 320 |
except Exception as e:
|
|
|
|
| 321 |
processed_count += 1
|
| 322 |
if progress_callback:
|
| 323 |
progress = 0.1 + (0.8 * processed_count / total_segments)
|
| 324 |
progress_callback(progress, f"Error in segment {segment.id}: {str(e)}")
|
| 325 |
raise
|
| 326 |
|
|
|
|
| 327 |
for segment in segments:
|
| 328 |
segment_tasks.append(process_with_semaphore(segment))
|
| 329 |
|
|
|
|
| 330 |
try:
|
| 331 |
processed_segments = await asyncio.gather(*segment_tasks)
|
| 332 |
except Exception as e:
|
|
|
|
| 334 |
progress_callback(0.9, f"Error during parallel processing: {str(e)}")
|
| 335 |
raise TTSError(f"Failed during parallel processing: {str(e)}")
|
| 336 |
else:
|
|
|
|
| 337 |
for i, segment in enumerate(segments):
|
| 338 |
try:
|
| 339 |
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
|
|
|
|
| 347 |
progress_callback(0.9, f"Error processing segment {segment.id}: {str(e)}")
|
| 348 |
raise TTSError(f"Failed to process segment {segment.id}: {str(e)}")
|
| 349 |
|
|
|
|
| 350 |
processed_segments.sort(key=lambda s: s.id)
|
| 351 |
|
| 352 |
if progress_callback:
|
| 353 |
progress_callback(0.9, "Finalizing audio and subtitles")
|
| 354 |
|
|
|
|
| 355 |
current_time = 0
|
| 356 |
final_audio = AudioSegment.empty()
|
| 357 |
srt_content = ""
|
| 358 |
|
| 359 |
for segment in processed_segments:
|
|
|
|
| 360 |
segment.start_time = current_time
|
| 361 |
segment.end_time = current_time + segment.duration
|
| 362 |
|
|
|
|
| 363 |
srt_content += (
|
| 364 |
f"{segment.id}\n"
|
| 365 |
f"{format_time_ms(segment.start_time)} --> {format_time_ms(segment.end_time)}\n"
|
| 366 |
f"{segment.text}\n\n"
|
| 367 |
)
|
| 368 |
|
|
|
|
| 369 |
final_audio = final_audio.append(segment.audio, crossfade=0)
|
|
|
|
|
|
|
| 370 |
current_time = segment.end_time
|
| 371 |
|
|
|
|
| 372 |
srt_path, audio_path = file_manager.create_output_paths()
|
| 373 |
|
| 374 |
try:
|
|
|
|
| 375 |
export_params = {
|
| 376 |
'format': 'mp3',
|
| 377 |
+
'bitrate': '192k',
|
| 378 |
'parameters': [
|
| 379 |
+
'-ar', '44100',
|
| 380 |
+
'-ac', '2',
|
| 381 |
+
'-compression_level', '0',
|
| 382 |
+
'-qscale:a', '2'
|
| 383 |
]
|
| 384 |
}
|
| 385 |
final_audio.export(audio_path, **export_params)
|
|
|
|
| 396 |
|
| 397 |
return srt_path, audio_path
|
| 398 |
|
| 399 |
+
# This function is now correctly aligned to return types expected by the UI
|
| 400 |
async def process_text_with_progress(
|
| 401 |
text,
|
| 402 |
pitch,
|
|
|
|
| 407 |
parallel_processing,
|
| 408 |
progress=gr.Progress()
|
| 409 |
):
|
| 410 |
+
# Initialize outputs to their default 'hidden' state by providing empty strings
|
| 411 |
+
# and setting visible=False via gr.update.
|
| 412 |
+
output_audio = None # gr.Audio expects None or a path
|
| 413 |
+
output_srt_link_html = gr.update(value="", visible=False) # gr.HTML expects a string
|
| 414 |
+
output_audio_link_html = gr.update(value="", visible=False) # gr.HTML expects a string
|
| 415 |
+
output_error_message = gr.update(value="", visible=False) # gr.Textbox expects a string
|
|
|
|
| 416 |
|
| 417 |
# Input validation
|
| 418 |
if not text or text.strip() == "":
|
|
|
|
|
|
|
| 419 |
return (
|
| 420 |
output_audio,
|
| 421 |
+
output_srt_link_html,
|
| 422 |
+
output_audio_link_html,
|
| 423 |
+
gr.update(value="Please enter some text to convert to speech.", visible=True)
|
| 424 |
)
|
| 425 |
|
|
|
|
| 426 |
pitch_str = f"{pitch:+d}Hz" if pitch != 0 else "+0Hz"
|
| 427 |
rate_str = f"{rate:+d}%" if rate != 0 else "+0%"
|
| 428 |
|
| 429 |
try:
|
|
|
|
| 430 |
progress(0, "Preparing text...")
|
| 431 |
|
| 432 |
def update_progress(value, status):
|
|
|
|
| 443 |
parallel=parallel_processing
|
| 444 |
)
|
| 445 |
|
| 446 |
+
# Create HTML strings for download links. Gradio serves files using "file=" prefix.
|
| 447 |
+
srt_download_html = f"""
|
| 448 |
+
<a href="file={srt_path}" download="subtitles.srt" target="_blank"
|
| 449 |
+
style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
|
| 450 |
+
onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
|
| 451 |
+
onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='none';">
|
| 452 |
+
Download SRT File
|
| 453 |
+
</a>
|
| 454 |
+
"""
|
| 455 |
+
audio_download_html = f"""
|
| 456 |
+
<a href="file={audio_path}" download="audio.mp3" target="_blank"
|
| 457 |
+
style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
|
| 458 |
+
onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
|
| 459 |
+
onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='none';">
|
| 460 |
+
Download Audio File
|
| 461 |
+
</a>
|
| 462 |
+
"""
|
| 463 |
+
|
| 464 |
return (
|
| 465 |
+
audio_path, # Path for gr.Audio preview
|
| 466 |
+
gr.update(value=srt_download_html, visible=True), # HTML link for SRT download
|
| 467 |
+
gr.update(value=audio_download_html, visible=True), # HTML link for Audio download
|
| 468 |
+
gr.update(value="", visible=False) # Hide error message
|
| 469 |
)
|
| 470 |
except TTSError as e:
|
| 471 |
+
error_message = f"TTS Error: {str(e)}"
|
| 472 |
except Exception as e:
|
| 473 |
+
error_message = f"Unexpected error: {str(e)}"
|
| 474 |
|
|
|
|
| 475 |
return (
|
| 476 |
+
None, # Clear audio output on error
|
| 477 |
gr.update(value="", visible=False), # Hide SRT download link
|
| 478 |
gr.update(value="", visible=False), # Hide Audio download link
|
| 479 |
+
gr.update(value=error_message, visible=True) # Show error message
|
| 480 |
)
|
| 481 |
|
| 482 |
+
# This function is not used in the final version of the code, but kept for context from your example.
|
| 483 |
+
def create_download_link(audio_path):
    """Build an HTML anchor that downloads the file at ``audio_path``.

    The anchor's ``href`` uses the ``file=<path>`` form and its ``download``
    attribute carries the file's base name. Its ``onclick`` handler fetches
    the ``href`` as a blob and triggers the download through a temporary
    anchor element.

    Both the path and the file name are HTML-escaped before being placed in
    attributes, so characters such as ``"``, ``'``, ``<`` and ``&`` cannot
    break out of the markup. The script reads the file name back from the
    ``download`` attribute instead of embedding it in a JavaScript string
    literal.

    Args:
        audio_path: Path of the file to link to, or ``None``.

    Returns:
        The HTML snippet as a string, or ``None`` when ``audio_path`` is
        ``None``.
    """
    if audio_path is None:
        return None

    # Local import keeps this block self-contained.
    from html import escape

    filename = escape(Path(audio_path).name, quote=True)
    # Gradio handles file serving with "file=" prefix directly, no need for base_url
    file_url = escape(f"file={audio_path}", quote=True)

    return f"""
    <a href="{file_url}"
       download="{filename}"
       target="_blank"
       rel="noopener noreferrer"
       style="display: inline-block; padding: 10px 20px; background: linear-gradient(135deg, #4776E6, #8E54E9); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;"
       onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 5px 15px rgba(71, 118, 230, 0.3)';"
       onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='none';"
       onclick="event.preventDefault(); fetch(this.href).then(resp => resp.blob()).then(blob => {{
           const url = window.URL.createObjectURL(blob);
           const a = document.createElement('a');
           a.style.display = 'none';
           a.href = url;
           a.download = this.getAttribute('download');
           document.body.appendChild(a);
           a.click();
           window.URL.revokeObjectURL(url);
           document.body.removeChild(a);
       }});">
        Download Audio File
    </a>
    """
|
| 513 |
+
|
| 514 |
+
def cleanup_file(filepath, delay=300):
    """Schedule ``filepath`` for deletion after ``delay`` seconds.

    Deletion runs on a ``Timer`` thread and is best-effort: a file that is
    already gone is ignored silently, and any other failure is printed
    rather than raised.

    Args:
        filepath: Path of the file to remove.
        delay: Seconds to wait before attempting the removal.

    Returns:
        The started ``Timer``, so callers may ``cancel()`` the pending
        deletion or ``join()`` it. Existing callers can ignore it.
    """
    def delete_file():
        try:
            # Attempt the removal directly instead of checking existence
            # first, so a file vanishing in between is not reported as an
            # error.
            os.remove(filepath)
        except FileNotFoundError:
            return
        except Exception as e:
            print(f"Error cleaning up file {filepath}: {e}")
        else:
            print(f"Cleaned up file: {filepath}")

    timer = Timer(delay, delete_file)
    timer.start()
    return timer
|
| 524 |
+
|
| 525 |
+
# --- Voice Options and Gradio Interface ---
|
| 526 |
+
language_dict = {
|
| 527 |
+
"Hindi": {
|
| 528 |
+
"Madhur": "hi-IN-MadhurNeural",
|
| 529 |
+
"Swara": "hi-IN-SwaraNeural"
|
| 530 |
+
},
|
| 531 |
+
"English": {
|
| 532 |
+
"Jenny": "en-US-JennyNeural",
|
| 533 |
+
"Guy": "en-US-GuyNeural",
|
| 534 |
+
"Ana": "en-US-AnaNeural",
|
| 535 |
+
"Aria": "en-US-AriaNeural",
|
| 536 |
+
"Brian": "en-US-BrianNeural",
|
| 537 |
+
"Christopher": "en-US-ChristopherNeural",
|
| 538 |
+
"Eric": "en-US-EricNeural",
|
| 539 |
+
"Michelle": "en-US-MichelleNeural",
|
| 540 |
+
"Roger": "en-US-RogerNeural",
|
| 541 |
+
"Natasha": "en-AU-NatashaNeural",
|
| 542 |
+
"William": "en-AU-WilliamNeural",
|
| 543 |
+
"Clara": "en-CA-ClaraNeural",
|
| 544 |
+
"Liam": "en-CA-LiamNeural",
|
| 545 |
+
"Libby": "en-GB-LibbyNeural",
|
| 546 |
"Maisie": "en-GB-MaisieNeural",
|
| 547 |
"Ryan": "en-GB-RyanNeural",
|
| 548 |
"Sonia": "en-GB-SoniaNeural",
|
|
|
|
| 566 |
"Imani": "en-TZ-ImaniNeural",
|
| 567 |
"Leah": "en-ZA-LeahNeural",
|
| 568 |
"Luke": "en-ZA-LukeNeural"
|
| 569 |
+
},
|
| 570 |
+
"Spanish": {
|
| 571 |
+
"Elena": "es-AR-ElenaNeural",
|
| 572 |
+
"Tomas": "es-AR-TomasNeural",
|
| 573 |
+
"Marcelo": "es-BO-MarceloNeural",
|
| 574 |
+
"Sofia": "es-BO-SofiaNeural",
|
| 575 |
+
"Gonzalo": "es-CO-GonzaloNeural",
|
| 576 |
+
"Salome": "es-CO-SalomeNeural",
|
| 577 |
+
"Juan": "es-CR-JuanNeural",
|
| 578 |
+
"Maria": "es-CR-MariaNeural",
|
| 579 |
+
"Belkys": "es-CU-BelkysNeural",
|
| 580 |
+
"Emilio": "es-DO-EmilioNeural",
|
| 581 |
+
"Ramona": "es-DO-RamonaNeural",
|
| 582 |
+
"Andrea": "es-EC-AndreaNeural",
|
| 583 |
+
"Luis": "es-EC-LuisNeural",
|
| 584 |
+
"Alvaro": "es-ES-AlvaroNeural",
|
| 585 |
+
"Elvira": "es-ES-ElviraNeural",
|
| 586 |
+
"Teresa": "es-GQ-TeresaNeural",
|
| 587 |
+
"Andres": "es-GT-AndresNeural",
|
| 588 |
+
"Marta": "es-GT-MartaNeural",
|
| 589 |
+
"Carlos": "es-HN-CarlosNeural",
|
| 590 |
+
"Karla": "es-HN-KarlaNeural",
|
| 591 |
+
"Federico": "es-NI-FedericoNeural",
|
| 592 |
+
"Yolanda": "es-NI-YolandaNeural",
|
| 593 |
+
"Margarita": "es-PA-MargaritaNeural",
|
| 594 |
+
"Roberto": "es-PA-RobertoNeural",
|
| 595 |
+
"Alex": "es-PE-AlexNeural",
|
| 596 |
+
"Camila": "es-PE-CamilaNeural",
|
| 597 |
+
"Karina": "es-PR-KarinaNeural",
|
| 598 |
+
"Victor": "es-PR-VictorNeural",
|
| 599 |
+
"Mario": "es-PY-MarioNeural",
|
| 600 |
+
"Tania": "es-PY-TaniaNeural",
|
| 601 |
+
"Lorena": "es-SV-LorenaNeural",
|
| 602 |
+
"Rodrigo": "es-SV-RodrigoNeural",
|
| 603 |
+
"Alonso": "es-US-AlonsoNeural",
|
| 604 |
+
"Paloma": "es-US-PalomaNeural",
|
| 605 |
+
"Mateo": "es-UY-MateoNeural",
|
| 606 |
+
"Valentina": "es-UY-ValentinaNeural",
|
| 607 |
+
"Paola": "es-VE-PaolaNeural",
|
| 608 |
+
"Sebastian": "es-VE-SebastianNeural"
|
| 609 |
+
},
|
| 610 |
+
"Arabic": {
|
| 611 |
+
"Hamed": "ar-SA-HamedNeural",
|
| 612 |
+
"Zariyah": "ar-SA-ZariyahNeural",
|
| 613 |
+
"Fatima": "ar-AE-FatimaNeural",
|
| 614 |
+
"Hamdan": "ar-AE-HamdanNeural",
|
| 615 |
+
"Ali": "ar-BH-AliNeural",
|
| 616 |
+
"Laila": "ar-BH-LailaNeural",
|
| 617 |
+
"Ismael": "ar-DZ-IsmaelNeural",
|
| 618 |
+
"Salma": "ar-EG-SalmaNeural",
|
| 619 |
+
"Shakir": "ar-EG-ShakirNeural",
|
| 620 |
+
"Bassel": "ar-IQ-BasselNeural",
|
| 621 |
+
"Rana": "ar-IQ-RanaNeural",
|
| 622 |
+
"Sana": "ar-JO-SanaNeural",
|
| 623 |
+
"Taim": "ar-JO-TaimNeural",
|
| 624 |
+
"Fahed": "ar-KW-FahedNeural",
|
| 625 |
+
"Noura": "ar-KW-NouraNeural",
|
| 626 |
+
"Layla": "ar-LB-LaylaNeural",
|
| 627 |
+
"Rami": "ar-LB-RamiNeural",
|
| 628 |
+
"Iman": "ar-LY-ImanNeural",
|
| 629 |
+
"Omar": "ar-LY-OmarNeural",
|
| 630 |
+
"Jamal": "ar-MA-JamalNeural",
|
| 631 |
+
"Mouna": "ar-MA-MounaNeural",
|
| 632 |
+
"Abdullah": "ar-OM-AbdullahNeural",
|
| 633 |
+
"Aysha": "ar-OM-AyshaNeural",
|
| 634 |
+
"Amal": "ar-QA-AmalNeural",
|
| 635 |
+
"Moaz": "ar-QA-MoazNeural",
|
| 636 |
+
"Amany": "ar-SY-AmanyNeural",
|
| 637 |
+
"Laith": "ar-SY-LaithNeural",
|
| 638 |
+
"Hedi": "ar-TN-HediNeural",
|
| 639 |
+
"Reem": "ar-TN-ReemNeural",
|
| 640 |
+
"Maryam": "ar-YE-MaryamNeural",
|
| 641 |
+
"Saleh": "ar-YE-SalehNeural"
|
| 642 |
+
},
|
| 643 |
+
"Korean": {
|
| 644 |
+
"Sun-Hi": "ko-KR-SunHiNeural",
|
| 645 |
+
"InJoon": "ko-KR-InJoonNeural"
|
| 646 |
+
},
|
| 647 |
+
"Thai": {
|
| 648 |
+
"Premwadee": "th-TH-PremwadeeNeural",
|
| 649 |
+
"Niwat": "th-TH-NiwatNeural"
|
| 650 |
+
},
|
| 651 |
+
"Vietnamese": {
|
| 652 |
+
"HoaiMy": "vi-VN-HoaiMyNeural",
|
| 653 |
+
"NamMinh": "vi-VN-NamMinhNeural"
|
| 654 |
+
},
|
| 655 |
+
"Japanese": {
|
| 656 |
+
"Nanami": "ja-JP-NanamiNeural",
|
| 657 |
+
"Keita": "ja-JP-KeitaNeural"
|
| 658 |
+
},
|
| 659 |
+
"French": {
|
| 660 |
+
"Denise": "fr-FR-DeniseNeural",
|
| 661 |
+
"Eloise": "fr-FR-EloiseNeural",
|
| 662 |
+
"Henri": "fr-FR-HenriNeural",
|
| 663 |
+
"Sylvie": "fr-CA-SylvieNeural",
|
| 664 |
+
"Antoine": "fr-CA-AntoineNeural",
|
| 665 |
+
"Jean": "fr-CA-JeanNeural",
|
| 666 |
+
"Ariane": "fr-CH-ArianeNeural",
|
| 667 |
+
"Fabrice": "fr-CH-FabriceNeural",
|
| 668 |
+
"Charline": "fr-BE-CharlineNeural",
|
| 669 |
+
"Gerard": "fr-BE-GerardNeural"
|
| 670 |
+
},
|
| 671 |
+
"Portuguese": {
|
| 672 |
+
"Francisca": "pt-BR-FranciscaNeural",
|
| 673 |
+
"Antonio": "pt-BR-AntonioNeural",
|
| 674 |
+
"Duarte": "pt-PT-DuarteNeural",
|
| 675 |
+
"Raquel": "pt-PT-RaquelNeural"
|
| 676 |
+
},
|
| 677 |
+
"Indonesian": {
|
| 678 |
+
"Ardi": "id-ID-ArdiNeural",
|
| 679 |
+
"Gadis": "id-ID-GadisNeural"
|
| 680 |
+
},
|
| 681 |
+
"Hebrew": {
|
| 682 |
+
"Avri": "he-IL-AvriNeural",
|
| 683 |
+
"Hila": "he-IL-HilaNeural"
|
| 684 |
+
},
|
| 685 |
+
"Italian": {
|
| 686 |
+
"Isabella": "it-IT-IsabellaNeural",
|
| 687 |
+
"Diego": "it-IT-DiegoNeural",
|
| 688 |
+
"Elsa": "it-IT-ElsaNeural"
|
| 689 |
+
},
|
| 690 |
+
"Dutch": {
|
| 691 |
+
"Colette": "nl-NL-ColetteNeural",
|
| 692 |
+
"Fenna": "nl-NL-FennaNeural",
|
| 693 |
+
"Maarten": "nl-NL-MaartenNeural",
|
| 694 |
+
"Arnaud": "nl-BE-ArnaudNeural",
|
| 695 |
+
"Dena": "nl-BE-DenaNeural"
|
| 696 |
+
},
|
| 697 |
+
"Malay": {
|
| 698 |
+
"Osman": "ms-MY-OsmanNeural",
|
| 699 |
+
"Yasmin": "ms-MY-YasminNeural"
|
| 700 |
+
},
|
| 701 |
+
"Norwegian": {
|
| 702 |
+
"Pernille": "nb-NO-PernilleNeural",
|
| 703 |
+
"Finn": "nb-NO-FinnNeural"
|
| 704 |
+
},
|
| 705 |
+
"Swedish": {
|
| 706 |
+
"Sofie": "sv-SE-SofieNeural",
|
| 707 |
+
"Mattias": "sv-SE-MattiasNeural"
|
| 708 |
+
},
|
| 709 |
+
"Greek": {
|
| 710 |
+
"Athina": "el-GR-AthinaNeural",
|
| 711 |
+
"Nestoras": "el-GR-NestorasNeural"
|
| 712 |
+
},
|
| 713 |
+
"German": {
|
| 714 |
+
"Katja": "de-DE-KatjaNeural",
|
| 715 |
+
"Amala": "de-DE-AmalaNeural",
|
| 716 |
+
"Conrad": "de-DE-ConradNeural",
|
| 717 |
+
"Killian": "de-DE-KillianNeural",
|
| 718 |
+
"Ingrid": "de-AT-IngridNeural",
|
| 719 |
+
"Jonas": "de-AT-JonasNeural",
|
| 720 |
+
"Jan": "de-CH-JanNeural",
|
| 721 |
+
"Leni": "de-CH-LeniNeural"
|
| 722 |
+
},
|
| 723 |
+
"Afrikaans": {
|
| 724 |
+
"Adri": "af-ZA-AdriNeural",
|
| 725 |
+
"Willem": "af-ZA-WillemNeural"
|
| 726 |
+
},
|
| 727 |
+
"Amharic": {
|
| 728 |
+
"Ameha": "am-ET-AmehaNeural",
|
| 729 |
+
"Mekdes": "am-ET-MekdesNeural"
|
| 730 |
+
},
|
| 731 |
+
"Azerbaijani": {
|
| 732 |
+
"Babek": "az-AZ-BabekNeural",
|
| 733 |
+
"Banu": "az-AZ-BanuNeural"
|
| 734 |
+
},
|
| 735 |
+
"Bulgarian": {
|
| 736 |
+
"Borislav": "bg-BG-BorislavNeural",
|
| 737 |
+
"Kalina": "bg-BG-KalinaNeural"
|
| 738 |
+
},
|
| 739 |
+
"Bengali": {
|
| 740 |
+
"Nabanita": "bn-BD-NabanitaNeural",
|
| 741 |
+
"Pradeep": "bn-BD-PradeepNeural",
|
| 742 |
+
"Bashkar": "bn-IN-BashkarNeural",
|
| 743 |
+
"Tanishaa": "bn-IN-TanishaaNeural"
|
| 744 |
+
},
|
| 745 |
+
"Bosnian": {
|
| 746 |
+
"Goran": "bs-BA-GoranNeural",
|
| 747 |
+
"Vesna": "bs-BA-VesnaNeural"
|
| 748 |
+
},
|
| 749 |
+
"Catalan": {
|
| 750 |
+
"Joana": "ca-ES-JoanaNeural",
|
| 751 |
+
"Enric": "ca-ES-EnricNeural"
|
| 752 |
+
},
|
| 753 |
+
"Czech": {
|
| 754 |
+
"Antonin": "cs-CZ-AntoninNeural",
|
| 755 |
+
"Vlasta": "cs-CZ-VlastaNeural"
|
| 756 |
+
},
|
| 757 |
+
"Welsh": {
|
| 758 |
+
"Aled": "cy-GB-AledNeural",
|
| 759 |
+
"Nia": "cy-GB-NiaNeural"
|
| 760 |
+
},
|
| 761 |
+
"Danish": {
|
| 762 |
+
"Christel": "da-DK-ChristelNeural",
|
| 763 |
+
"Jeppe": "da-DK-JeppeNeural"
|
| 764 |
+
},
|
| 765 |
+
"Estonian": {
|
| 766 |
+
"Anu": "et-EE-AnuNeural",
|
| 767 |
+
"Kert": "et-EE-KertNeural"
|
| 768 |
+
},
|
| 769 |
+
"Persian": {
|
| 770 |
+
"Dilara": "fa-IR-DilaraNeural",
|
| 771 |
+
"Farid": "fa-IR-FaridNeural"
|
| 772 |
+
},
|
| 773 |
+
"Finnish": {
|
| 774 |
+
"Harri": "fi-FI-HarriNeural",
|
| 775 |
+
"Noora": "fi-FI-NooraNeural"
|
| 776 |
+
},
|
| 777 |
+
"Irish": {
|
| 778 |
+
"Colm": "ga-IE-ColmNeural",
|
| 779 |
+
"Orla": "ga-IE-OrlaNeural"
|
| 780 |
+
},
|
| 781 |
+
"Galician": {
|
| 782 |
+
"Roi": "gl-ES-RoiNeural",
|
| 783 |
+
"Sabela": "gl-ES-SabelaNeural"
|
| 784 |
+
},
|
| 785 |
+
"Gujarati": {
|
| 786 |
+
"Dhwani": "gu-IN-DhwaniNeural",
|
| 787 |
+
"Niranjan": "gu-IN-NiranjanNeural"
|
| 788 |
+
},
|
| 789 |
+
"Croatian": {
|
| 790 |
+
"Gabrijela": "hr-HR-GabrijelaNeural",
|
| 791 |
+
"Srecko": "hr-HR-SreckoNeural"
|
| 792 |
+
},
|
| 793 |
+
"Hungarian": {
|
| 794 |
+
"Noemi": "hu-HU-NoemiNeural",
|
| 795 |
+
"Tamas": "hu-HU-TamasNeural"
|
| 796 |
+
},
|
| 797 |
+
"Icelandic": {
|
| 798 |
+
"Gudrun": "is-IS-GudrunNeural",
|
| 799 |
+
"Gunnar": "is-IS-GunnarNeural"
|
| 800 |
+
},
|
| 801 |
+
"Javanese": {
|
| 802 |
+
"Dimas": "jv-ID-DimasNeural",
|
| 803 |
+
"Siti": "jv-ID-SitiNeural"
|
| 804 |
+
},
|
| 805 |
+
"Georgian": {
|
| 806 |
+
"Eka": "ka-GE-EkaNeural",
|
| 807 |
+
"Giorgi": "ka-GE-GiorgiNeural"
|
| 808 |
+
},
|
| 809 |
+
"Kazakh": {
|
| 810 |
+
"Aigul": "kk-KZ-AigulNeural",
|
| 811 |
+
"Daulet": "kk-KZ-DauletNeural"
|
| 812 |
+
},
|
| 813 |
+
"Khmer": {
|
| 814 |
+
"Piseth": "km-KH-PisethNeural",
|
| 815 |
+
"Sreymom": "km-KH-SreymomNeural"
|
| 816 |
+
},
|
| 817 |
+
"Kannada": {
|
| 818 |
+
"Gagan": "kn-IN-GaganNeural",
|
| 819 |
+
"Sapna": "kn-IN-SapnaNeural"
|
| 820 |
+
},
|
| 821 |
+
"Lao": {
|
| 822 |
+
"Chanthavong": "lo-LA-ChanthavongNeural",
|
| 823 |
+
"Keomany": "lo-LA-KeomanyNeural"
|
| 824 |
+
},
|
| 825 |
+
"Lithuanian": {
|
| 826 |
+
"Leonas": "lt-LT-LeonasNeural",
|
| 827 |
+
"Ona": "lt-LT-OnaNeural"
|
| 828 |
+
},
|
| 829 |
+
"Latvian": {
|
| 830 |
+
"Everita": "lv-LV-EveritaNeural",
|
| 831 |
+
"Nils": "lv-LV-NilsNeural"
|
| 832 |
+
},
|
| 833 |
+
"Macedonian": {
|
| 834 |
+
"Aleksandar": "mk-MK-AleksandarNeural",
|
| 835 |
+
"Marija": "mk-MK-MarijaNeural"
|
| 836 |
+
},
|
| 837 |
+
"Malayalam": {
|
| 838 |
+
"Midhun": "ml-IN-MidhunNeural",
|
| 839 |
+
"Sobhana": "ml-IN-SobhanaNeural"
|
| 840 |
+
},
|
| 841 |
+
"Mongolian": {
|
| 842 |
+
"Bataa": "mn-MN-BataaNeural",
|
| 843 |
+
"Yesui": "mn-MN-YesuiNeural"
|
| 844 |
+
},
|
| 845 |
+
"Marathi": {
|
| 846 |
+
"Aarohi": "mr-IN-AarohiNeural",
|
| 847 |
+
"Manohar": "mr-IN-ManoharNeural"
|
| 848 |
+
},
|
| 849 |
+
"Maltese": {
|
| 850 |
+
"Grace": "mt-MT-GraceNeural",
|
| 851 |
+
"Joseph": "mt-MT-JosephNeural"
|
| 852 |
+
},
|
| 853 |
+
"Burmese": {
|
| 854 |
+
"Nilar": "my-MM-NilarNeural",
|
| 855 |
+
"Thiha": "my-MM-ThihaNeural"
|
| 856 |
+
},
|
| 857 |
+
"Nepali": {
|
| 858 |
+
"Hemkala": "ne-NP-HemkalaNeural",
|
| 859 |
+
"Sagar": "ne-NP-SagarNeural"
|
| 860 |
+
},
|
| 861 |
+
"Polish": {
|
| 862 |
+
"Marek": "pl-PL-MarekNeural",
|
| 863 |
+
"Zofia": "pl-PL-ZofiaNeural"
|
| 864 |
+
},
|
| 865 |
+
"Pashto": {
|
| 866 |
+
"Gul Nawaz": "ps-AF-GulNawazNeural",
|
| 867 |
+
"Latifa": "ps-AF-LatifaNeural"
|
| 868 |
+
},
|
| 869 |
+
"Romanian": {
|
| 870 |
+
"Alina": "ro-RO-AlinaNeural",
|
| 871 |
+
"Emil": "ro-RO-EmilNeural"
|
| 872 |
+
},
|
| 873 |
+
"Russian": {
|
| 874 |
+
"Svetlana": "ru-RU-SvetlanaNeural",
|
| 875 |
+
"Dmitry": "ru-RU-DmitryNeural"
|
| 876 |
+
},
|
| 877 |
+
"Sinhala": {
|
| 878 |
+
"Sameera": "si-LK-SameeraNeural",
|
| 879 |
+
"Thilini": "si-LK-ThiliniNeural"
|
| 880 |
+
},
|
| 881 |
+
"Slovak": {
|
| 882 |
+
"Lukas": "sk-SK-LukasNeural",
|
| 883 |
+
"Viktoria": "sk-SK-ViktoriaNeural"
|
| 884 |
+
},
|
| 885 |
+
"Slovenian": {
|
| 886 |
+
"Petra": "sl-SI-PetraNeural",
|
| 887 |
+
"Rok": "sl-SI-RokNeural"
|
| 888 |
+
},
|
| 889 |
+
"Somali": {
|
| 890 |
+
"Muuse": "so-SO-MuuseNeural",
|
| 891 |
+
"Ubax": "so-SO-UbaxNeural"
|
| 892 |
+
},
|
| 893 |
+
"Albanian": {
|
| 894 |
+
"Anila": "sq-AL-AnilaNeural",
|
| 895 |
+
"Ilir": "sq-AL-IlirNeural"
|
| 896 |
+
},
|
| 897 |
+
"Serbian": {
|
| 898 |
+
"Nicholas": "sr-RS-NicholasNeural",
|
| 899 |
+
"Sophie": "sr-RS-SophieNeural"
|
| 900 |
+
},
|
| 901 |
+
"Sundanese": {
|
| 902 |
+
"Jajang": "su-ID-JajangNeural",
|
| 903 |
+
"Tuti": "su-ID-TutiNeural"
|
| 904 |
+
},
|
| 905 |
+
"Swahili": {
|
| 906 |
+
"Rafiki": "sw-KE-RafikiNeural",
|
| 907 |
+
"Zuri": "sw-KE-ZuriNeural",
|
| 908 |
+
"Daudi": "sw-TZ-DaudiNeural",
|
| 909 |
+
"Rehema": "sw-TZ-RehemaNeural"
|
| 910 |
+
},
|
| 911 |
+
"Tamil": {
|
| 912 |
+
"Pallavi": "ta-IN-PallaviNeural",
|
| 913 |
+
"Valluvar": "ta-IN-ValluvarNeural",
|
| 914 |
+
"Kumar": "ta-LK-KumarNeural",
|
| 915 |
+
"Saranya": "ta-LK-SaranyaNeural",
|
| 916 |
+
"Kani": "ta-MY-KaniNeural",
|
| 917 |
+
"Surya": "ta-MY-SuryaNeural",
|
| 918 |
+
"Anbu": "ta-SG-AnbuNeural"
|
| 919 |
+
},
|
| 920 |
+
"Telugu": {
|
| 921 |
+
"Mohan": "te-IN-MohanNeural",
|
| 922 |
+
"Shruti": "te-IN-ShrutiNeural"
|
| 923 |
+
},
|
| 924 |
+
"Turkish": {
|
| 925 |
+
"Ahmet": "tr-TR-AhmetNeural",
|
| 926 |
+
"Emel": "tr-TR-EmelNeural"
|
| 927 |
+
},
|
| 928 |
+
"Ukrainian": {
|
| 929 |
+
"Ostap": "uk-UA-OstapNeural",
|
| 930 |
+
"Polina": "uk-UA-PolinaNeural"
|
| 931 |
+
},
|
| 932 |
+
"Urdu": {
|
| 933 |
+
"Gul": "ur-IN-GulNeural",
|
| 934 |
+
"Salman": "ur-IN-SalmanNeural",
|
| 935 |
+
"Asad": "ur-PK-AsadNeural",
|
| 936 |
+
"Uzma": "ur-PK-UzmaNeural"
|
| 937 |
+
},
|
| 938 |
+
"Uzbek": {
|
| 939 |
+
"Madina": "uz-UZ-MadinaNeural",
|
| 940 |
+
"Sardor": "uz-UZ-SardorNeural"
|
| 941 |
+
},
|
| 942 |
+
"Mandarin": {
|
| 943 |
+
"Xiaoxiao": "zh-CN-XiaoxiaoNeural",
|
| 944 |
+
"Yunyang": "zh-CN-YunyangNeural",
|
| 945 |
+
"Yunxi": "zh-CN-YunxiNeural",
|
| 946 |
+
"Xiaoyi": "zh-CN-XiaoyiNeural",
|
| 947 |
+
"Yunjian": "zh-CN-YunjianNeural",
|
| 948 |
+
"Yunxia": "zh-CN-YunxiaNeural",
|
| 949 |
+
"Xiaobei": "zh-CN-liaoning-XiaobeiNeural",
|
| 950 |
+
"Xiaoni": "zh-CN-shaanxi-XiaoniNeural",
|
| 951 |
+
"HiuMaan": "zh-HK-HiuMaanNeural",
|
| 952 |
+
"HiuGaai": "zh-HK-HiuGaaiNeural",
|
| 953 |
+
"WanLung": "zh-HK-WanLungNeural",
|
| 954 |
+
"HsiaoChen": "zh-TW-HsiaoChenNeural",
|
| 955 |
+
"HsiaoYu": "zh-TW-HsiaoYuNeural",
|
| 956 |
+
"YunJhe": "zh-TW-YunJheNeural"
|
| 957 |
+
},
|
| 958 |
+
"Zulu": {
|
| 959 |
+
"Thando": "zu-ZA-ThandoNeural",
|
| 960 |
+
"Themba": "zu-ZA-ThembaNeural"
|
| 961 |
+
}
|
| 962 |
}
|
| 963 |
|
| 964 |
+
# Initial selections, defined up front so they exist before any UI event fires.
default_language = "English"
# Voice identifier mapped to the first speaker listed under the default language.
default_speaker = list(language_dict[default_language].values())[0]
|
| 967 |
+
|
| 968 |
+
def get_speakers(language):
    """Build the component updates that follow a language selection change.

    Args:
        language: Key of ``language_dict`` selected in the language dropdown.
            An unknown or empty selection produces an empty speaker list
            instead of raising ``KeyError``.

    Returns:
        A 2-tuple of ``gr.update`` payloads. The first replaces the speaker
        dropdown's choices and selects the first speaker (or ``None`` when
        there are no speakers). The second makes the Tashkeel checkbox
        visible only when ``language`` is "Arabic".
    """
    # .get() keeps the handler from crashing if the dropdown is cleared or
    # holds a value that is not a known language.
    speakers = list(language_dict.get(language) or {})
    speaker_update = gr.update(
        choices=speakers,
        value=speakers[0] if speakers else None,
        interactive=True,
    )
    # Use gr.update here too (rather than constructing a new gr.Checkbox) so
    # both outputs are plain property updates of the existing components.
    checkbox_update = gr.update(visible=language == "Arabic", interactive=True)
    return speaker_update, checkbox_update
|
| 972 |
+
|
| 973 |
# Run the global file manager's cleanup_all when the interpreter exits normally.
atexit.register(file_manager.cleanup_all)
|
| 974 |
|
| 975 |
# Create Gradio interface
|
| 976 |
+
with gr.Blocks(title="Advanced TTS with Configurable SRT Generation",
|
| 977 |
+
css="""
|
| 978 |
+
:root {
|
| 979 |
+
--primary-color: #4776E6;
|
| 980 |
+
--secondary-color: #8E54E9;
|
| 981 |
+
--background-light: #ffffff;
|
| 982 |
+
--card-light: #f8f9fa;
|
| 983 |
+
--text-dark: #2d3436;
|
| 984 |
+
--text-gray: #636e72;
|
| 985 |
+
--border-color: #e0e0e0;
|
| 986 |
+
}
|
| 987 |
+
|
| 988 |
+
@media (max-width: 768px) {
|
| 989 |
+
.container {
|
| 990 |
+
padding: 10px !important;
|
| 991 |
+
}
|
| 992 |
+
.header h1 {
|
| 993 |
+
font-size: 1.5em !important;
|
| 994 |
+
}
|
| 995 |
+
}
|
| 996 |
+
|
| 997 |
+
body {
|
| 998 |
+
background-color: var(--background-light);
|
| 999 |
+
}
|
| 1000 |
+
|
| 1001 |
+
.container {
|
| 1002 |
+
background-color: var(--background-light);
|
| 1003 |
+
max-width: 1200px;
|
| 1004 |
+
margin: 0 auto;
|
| 1005 |
+
padding: 20px;
|
| 1006 |
+
}
|
| 1007 |
+
|
| 1008 |
+
.header {
|
| 1009 |
+
text-align: center;
|
| 1010 |
+
margin-bottom: 30px;
|
| 1011 |
+
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
|
| 1012 |
+
padding: 25px;
|
| 1013 |
+
border-radius: 15px;
|
| 1014 |
+
color: white;
|
| 1015 |
+
box-shadow: 0 4px 15px rgba(71, 118, 230, 0.2);
|
| 1016 |
+
}
|
| 1017 |
+
|
| 1018 |
+
.input-section, .output-section {
|
| 1019 |
+
background-color: var(--card-light);
|
| 1020 |
+
padding: 25px;
|
| 1021 |
+
border-radius: 15px;
|
| 1022 |
+
margin-bottom: 20px;
|
| 1023 |
+
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
|
| 1024 |
+
border: 1px solid var(--border-color);
|
| 1025 |
+
width: 100%;
|
| 1026 |
+
}
|
| 1027 |
+
|
| 1028 |
+
.input-box textarea {
|
| 1029 |
+
min-height: 120px !important;
|
| 1030 |
+
font-size: 16px !important;
|
| 1031 |
+
border: 1px solid var(--border-color) !important;
|
| 1032 |
+
border-radius: 10px !important;
|
| 1033 |
+
padding: 15px !important;
|
| 1034 |
+
width: 100% !important;
|
| 1035 |
+
}
|
| 1036 |
+
|
| 1037 |
+
.dropdown {
|
| 1038 |
+
width: 100% !important;
|
| 1039 |
+
}
|
| 1040 |
+
|
| 1041 |
+
select, input[type="text"] {
|
| 1042 |
+
width: 100% !important;
|
| 1043 |
+
padding: 12px !important;
|
| 1044 |
+
border-radius: 8px !important;
|
| 1045 |
+
border: 1px solid var(--border-color) !important;
|
| 1046 |
+
}
|
| 1047 |
+
|
| 1048 |
+
.generate-btn {
|
| 1049 |
+
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color)) !important;
|
| 1050 |
+
padding: 15px 30px !important;
|
| 1051 |
+
border-radius: 10px !important;
|
| 1052 |
+
font-weight: 600 !important;
|
| 1053 |
+
letter-spacing: 0.5px !important;
|
| 1054 |
+
width: 100% !important;
|
| 1055 |
+
margin-top: 15px !important;
|
| 1056 |
+
}
|
| 1057 |
+
|
| 1058 |
+
.generate-btn:hover {
|
| 1059 |
+
transform: translateY(-2px);
|
| 1060 |
+
box-shadow: 0 5px 15px rgba(71, 118, 230, 0.3) !important;
|
| 1061 |
+
}
|
| 1062 |
+
|
| 1063 |
+
.download-btn {
|
| 1064 |
+
margin-top: 20px;
|
| 1065 |
+
text-align: center;
|
| 1066 |
+
}
|
| 1067 |
+
|
| 1068 |
+
.download-btn a {
|
| 1069 |
+
display: inline-flex;
|
| 1070 |
+
align-items: center;
|
| 1071 |
+
justify-content: center;
|
| 1072 |
+
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
|
| 1073 |
+
color: white;
|
| 1074 |
+
padding: 12px 25px;
|
| 1075 |
+
border-radius: 10px;
|
| 1076 |
+
text-decoration: none;
|
| 1077 |
+
font-weight: 600;
|
| 1078 |
+
letter-spacing: 0.5px;
|
| 1079 |
+
transition: all 0.3s ease;
|
| 1080 |
+
gap: 8px;
|
| 1081 |
+
width: 100%;
|
| 1082 |
+
max-width: 300px;
|
| 1083 |
+
}
|
| 1084 |
+
|
| 1085 |
+
.download-btn a:before {
|
| 1086 |
+
content: "⬇️";
|
| 1087 |
+
font-size: 1.2em;
|
| 1088 |
+
}
|
| 1089 |
+
|
| 1090 |
+
.download-btn a:hover {
|
| 1091 |
+
transform: translateY(-2px);
|
| 1092 |
+
box-shadow: 0 5px 15px rgba(71, 118, 230, 0.3);
|
| 1093 |
+
}
|
| 1094 |
+
|
| 1095 |
+
/* Audio player styling */
|
| 1096 |
+
audio {
|
| 1097 |
+
width: 100% !important;
|
| 1098 |
+
margin: 15px 0 !important;
|
| 1099 |
+
border-radius: 10px !important;
|
| 1100 |
+
}
|
| 1101 |
+
|
| 1102 |
+
/* Hide output text - this CSS is from your original file, ensure it's intentional */
|
| 1103 |
+
#output-text {
|
| 1104 |
+
display: none !important;
|
| 1105 |
+
}
|
| 1106 |
+
"""
|
| 1107 |
+
) as app: # Changed demo to app for consistency
|
| 1108 |
gr.Markdown("# Advanced TTS with Configurable SRT Generation")
|
| 1109 |
gr.Markdown("Generate perfectly synchronized audio and subtitles with natural speech patterns.")
|
| 1110 |
|
|
|
|
| 1113 |
text_input = gr.Textbox(label="Enter Text", lines=10, placeholder="Enter your text here...")
|
| 1114 |
|
| 1115 |
with gr.Column(scale=2):
|
| 1116 |
+
language = gr.Dropdown( # Changed to language for consistency
|
| 1117 |
+
label="Select Language",
|
| 1118 |
+
choices=list(language_dict.keys()),
|
| 1119 |
+
value=default_language,
|
| 1120 |
+
interactive=True
|
| 1121 |
+
)
|
| 1122 |
+
speaker = gr.Dropdown( # Changed to speaker for consistency
|
| 1123 |
label="Select Voice",
|
| 1124 |
+
choices=list(language_dict[default_language].keys()), # Initialize with default language's speakers
|
| 1125 |
+
value=list(language_dict[default_language].keys())[0], # Default to first speaker of default language
|
| 1126 |
+
interactive=True # Should be interactive if it changes based on language
|
| 1127 |
)
|
| 1128 |
pitch_slider = gr.Slider(
|
| 1129 |
label="Pitch Adjustment (Hz)",
|
|
|
|
| 1165 |
value=True,
|
| 1166 |
info="Process multiple segments simultaneously for faster conversion (recommended for longer texts)"
|
| 1167 |
)
|
| 1168 |
+
tashkeel_checkbox = gr.Checkbox( # Moved here for better layout
|
| 1169 |
+
label="Tashkeel (Arabic Only)",
|
| 1170 |
+
value=False,
|
| 1171 |
+
visible=False, # Initially hidden
|
| 1172 |
+
interactive=True
|
| 1173 |
+
)
|
| 1174 |
|
| 1175 |
submit_btn = gr.Button("Generate Audio & Subtitles")
|
| 1176 |
|
|
|
|
| 1177 |
error_output = gr.Textbox(label="Status", visible=False, interactive=False)
|
| 1178 |
|
| 1179 |
with gr.Row():
|
| 1180 |
with gr.Column():
|
| 1181 |
audio_output = gr.Audio(label="Preview Audio")
|
| 1182 |
with gr.Column():
|
| 1183 |
+
# Use gr.HTML for download links
|
| 1184 |
+
srt_download_link = gr.HTML(value="", visible=False, label="Download SRT")
|
| 1185 |
+
audio_download_link = gr.HTML(value="", visible=False, label="Download Audio")
|
| 1186 |
+
|
| 1187 |
+
# Event Handlers
|
| 1188 |
+
language.change(
|
| 1189 |
+
fn=get_speakers,
|
| 1190 |
+
inputs=[language],
|
| 1191 |
+
outputs=[speaker, tashkeel_checkbox] # Ensure correct output for dropdown and checkbox
|
| 1192 |
+
)
|
| 1193 |
+
|
| 1194 |
submit_btn.click(
|
| 1195 |
fn=process_text_with_progress,
|
| 1196 |
inputs=[
|
| 1197 |
text_input,
|
| 1198 |
pitch_slider,
|
| 1199 |
rate_slider,
|
| 1200 |
+
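speaker, # Selected speaker name (a key of language_dict[language]); NOTE(review): this is the display name, not the voice code - confirm process_text_with_progress resolves it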
|
| 1201 |
words_per_line,
|
| 1202 |
lines_per_segment,
|
| 1203 |
parallel_processing
|