Spaces:
Build error
Build error
Commit ·
4f2415a
1
Parent(s): 16311fa
Add multi-speaker support and interface for text-to-speech conversion
Browse files
app.py
CHANGED
|
@@ -311,6 +311,217 @@ async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_f
|
|
| 311 |
return audio, subtitle, None
|
| 312 |
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
async def create_demo():
|
| 315 |
voices = await get_voices()
|
| 316 |
|
|
@@ -322,6 +533,7 @@ async def create_demo():
|
|
| 322 |
|
| 323 |
features = """
|
| 324 |
## ✨ Latest Features
|
|
|
|
| 325 |
- **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
|
| 326 |
- **SRT Generation**: Create subtitle files alongside your audio for perfect timing
|
| 327 |
- **File Upload**: Easily upload TXT or SRT files for conversion
|
|
@@ -333,72 +545,152 @@ async def create_demo():
|
|
| 333 |
gr.Markdown(description)
|
| 334 |
gr.Markdown(features)
|
| 335 |
|
| 336 |
-
with gr.
|
| 337 |
-
with gr.
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
)
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
)
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
00:00:00,112 --> 00:00:01,647
|
| 366 |
-
Hello how are you doing
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
outputs=outputs
|
| 399 |
-
)
|
| 400 |
|
| 401 |
-
gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion
|
| 402 |
|
| 403 |
return demo
|
| 404 |
|
|
@@ -411,3 +703,4 @@ async def main():
|
|
| 411 |
|
| 412 |
if __name__ == "__main__":
|
| 413 |
asyncio.run(main())
|
|
|
|
|
|
| 311 |
return audio, subtitle, None
|
| 312 |
|
| 313 |
|
| 314 |
+
async def parse_multi_speaker_text(text):
    """Split a script into per-speaker segments.

    A line of the form ``Speaker<N>: ...`` or ``S<N>: ...`` (case-insensitive,
    optional whitespace before the digits for the ``Speaker`` form) starts a
    new segment. Following non-blank lines without a label are appended to
    the segment of the most recent speaker. Lines that appear before the
    first label, and speakers that end up with no text, are dropped.

    Args:
        text: The raw multi-line script.

    Returns:
        A list of ``{'speaker': str, 'text': str}`` dicts in script order,
        where ``text`` is the speaker's lines joined with single spaces.
    """
    label_re = re.compile(r'^(Speaker\s*\d+|S\d+)\s*:\s*(.*)$', re.IGNORECASE)

    # Each entry is (speaker_label, [text parts]); a new entry is opened
    # every time a speaker label is encountered.
    groups = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        hit = label_re.match(stripped)
        if hit is not None:
            inline_text = hit.group(2).strip()
            groups.append((hit.group(1).strip(), [inline_text] if inline_text else []))
        elif stripped and groups:
            # Continuation line: belongs to whoever spoke last.
            groups[-1][1].append(stripped)

    # Speakers that never received any text are not emitted.
    return [
        {'speaker': label, 'text': ' '.join(parts).strip()}
        for label, parts in groups
        if parts
    ]
|
| 349 |
+
|
| 350 |
+
async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
    """Synthesize a multi-speaker script into one MP3, optionally with subtitles.

    The script is split with ``parse_multi_speaker_text``. The number in each
    speaker label selects an entry of ``speaker_settings`` (``Speaker1`` ->
    index 0); numbers beyond the list are clamped to the last entry. Each
    segment is synthesized with Edge TTS using that entry's voice, rate and
    pitch, and the segments are concatenated in script order.

    Args:
        text: Script with ``Speaker<N>:`` / ``S<N>:`` labels.
        speaker_settings: List of dicts with keys ``'voice'`` (the part
            before ``" - "`` is used as the voice name), ``'rate'`` (percent)
            and ``'pitch'`` (Hz).
        generate_subtitles: When true, also write an SRT file whose entries
            are prefixed with ``[speaker]``.

    Returns:
        ``(audio_path, subtitle_path, warning)``. On success ``warning`` is
        ``None`` and ``subtitle_path`` is ``None`` unless subtitles were
        requested. On a validation failure the first two items are ``None``
        and ``warning`` holds the message.
    """
    if not text.strip():
        return None, None, "Please enter text to convert."

    speaker_segments = await parse_multi_speaker_text(text)
    if not speaker_segments:
        return None, None, "No valid speaker segments found in the text."

    # Resolve every segment's settings up front so that a configuration
    # error is reported before any temp file is created or any synthesis
    # request is made (previously earlier segments were synthesized and the
    # output temp files were left behind).
    jobs = []
    for segment in speaker_segments:
        speaker = segment['speaker']
        number = re.search(r'\d+', speaker)
        speaker_num = int(number.group()) if number else 1
        # Clamp to the last configured speaker; a negative index means
        # either "Speaker0" or an empty settings list.
        speaker_idx = min(speaker_num - 1, len(speaker_settings) - 1)
        if speaker_idx < 0 or not speaker_settings[speaker_idx]['voice']:
            return None, None, f"No voice selected for {speaker}."

        settings = speaker_settings[speaker_idx]
        jobs.append({
            'speaker': speaker,
            'text': segment['text'],
            'voice': settings['voice'].split(" - ")[0],
            # int() so that float slider values (e.g. 5.0) do not break
            # the ``+d`` format spec.
            'rate': f"{int(settings['rate']):+d}%",
            'pitch': f"{int(settings['pitch']):+d}Hz",
        })

    # Reserve the output paths; delete=False because the caller consumes
    # the files after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        final_audio_path = tmp_file.name

    subtitle_path = None
    if generate_subtitles:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
            subtitle_path = srt_file.name

    # Imported here (as the original did) so the module does not require
    # pydub unless multi-speaker synthesis is actually used.
    from pydub import AudioSegment

    combined = AudioSegment.empty()
    subtitle_entries = []
    current_offset = 0  # Running length of the combined audio, in milliseconds

    with tempfile.TemporaryDirectory() as temp_dir:
        for i, job in enumerate(jobs):
            segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
            communicate = edge_tts.Communicate(
                job['text'], job['voice'], rate=job['rate'], pitch=job['pitch']
            )

            if generate_subtitles:
                # Stream so that word-boundary events can be captured
                # alongside the audio bytes.
                word_boundaries = []
                with open(segment_file, "wb") as audio_file:
                    async for chunk in communicate.stream():
                        if chunk["type"] == "audio":
                            audio_file.write(chunk["data"])
                        elif chunk["type"] == "WordBoundary":
                            # Shift into the combined timeline
                            # (ms -> 100ns units).
                            adjusted_chunk = chunk.copy()
                            adjusted_chunk["offset"] += current_offset * 10000
                            word_boundaries.append(adjusted_chunk)
                subtitle_entries.extend(
                    _group_word_boundaries(word_boundaries, job['speaker'])
                )
            else:
                await communicate.save(segment_file)

            # Decode once: the same object provides the duration and is
            # appended to the combined track.
            audio = AudioSegment.from_file(segment_file)
            combined += audio
            current_offset += len(audio)

        combined.export(final_audio_path, format="mp3")

    if generate_subtitles and subtitle_path:
        # NOTE(review): format_time is defined elsewhere in this file and is
        # presumably given milliseconds here - confirm.
        with open(subtitle_path, "w", encoding="utf-8") as f:
            for i, entry in enumerate(subtitle_entries):
                f.write(f"{i+1}\n")
                f.write(f"{format_time(entry['start'])} --> {format_time(entry['end'])}\n")
                f.write(f"{entry['text']}\n\n")

    return final_audio_path, subtitle_path, None


def _group_word_boundaries(word_boundaries, speaker):
    """Group word-boundary events into subtitle phrases for one speaker.

    A phrase ends at a word that ends with punctuation, after five words,
    before a gap of more than 300 ms to the next word, or at the last word.

    Args:
        word_boundaries: Dicts with ``'text'``, ``'offset'`` and
            ``'duration'`` (offset and duration in 100ns units).
        speaker: Label used as the ``[speaker]`` prefix of each phrase.

    Returns:
        A list of ``{'text', 'start', 'end'}`` dicts with times in
        milliseconds; each phrase appears exactly once.
    """
    punctuation = ('.', ',', '!', '?', ':', ';')
    phrases = []
    words_in_phrase = 0
    current_text = ""
    phrase_start = 0
    last_index = len(word_boundaries) - 1

    for j, boundary in enumerate(word_boundaries):
        word = boundary["text"]
        start_time = boundary["offset"] / 10000
        end_time = start_time + boundary["duration"] / 10000

        if words_in_phrase == 0:
            phrase_start = start_time
        words_in_phrase += 1

        if word.startswith(punctuation):
            # Attach punctuation to the previous word without a space.
            current_text = current_text.rstrip() + word + " "
        else:
            current_text += word + " "

        should_break = (
            j == last_index
            or word.endswith(punctuation)
            or words_in_phrase >= 5
            or word_boundaries[j + 1]["offset"] / 10000 - end_time > 300
        )
        if should_break:
            phrases.append({
                "text": f"[{speaker}] {current_text.strip()}",
                "start": phrase_start,
                "end": (boundary["offset"] + boundary["duration"]) / 10000,
            })
            words_in_phrase = 0
            current_text = ""

    return phrases
|
| 494 |
+
|
| 495 |
+
async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
                                  speaker2_voice, speaker2_rate, speaker2_pitch):
    """Gradio callback that adapts flat UI inputs to ``multi_speaker_tts``.

    Builds the speaker settings list from the per-speaker voice/rate/pitch
    values, runs the synthesis, and wraps any warning message with
    ``gr.Warning``.

    Returns:
        ``(audio, subtitle, warning)`` where ``warning`` is ``None`` when
        synthesis reported no problem.
    """
    per_speaker_inputs = (
        (speaker1_voice, speaker1_rate, speaker1_pitch),
        (speaker2_voice, speaker2_rate, speaker2_pitch),
    )
    # Only speakers with a selected voice are kept.
    # NOTE(review): the list is compacted, so if Speaker 1 has no voice the
    # Speaker 2 settings end up at index 0 - confirm this is intended.
    speaker_settings = [
        {'voice': voice, 'rate': rate, 'pitch': pitch}
        for voice, rate, pitch in per_speaker_inputs
        if voice
    ]

    if len(speaker_settings) == 0:
        return None, None, gr.Warning("Please select at least one speaker voice.")

    audio, subtitle, warning = await multi_speaker_tts(text, speaker_settings, generate_subtitles)
    notice = gr.Warning(warning) if warning else None
    return audio, subtitle, notice
|
| 524 |
+
|
| 525 |
async def create_demo():
|
| 526 |
voices = await get_voices()
|
| 527 |
|
|
|
|
| 533 |
|
| 534 |
features = """
|
| 535 |
## ✨ Latest Features
|
| 536 |
+
- **Single & Multi-Speaker Support**: Choose between single speaker or multi-speaker modes
|
| 537 |
- **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
|
| 538 |
- **SRT Generation**: Create subtitle files alongside your audio for perfect timing
|
| 539 |
- **File Upload**: Easily upload TXT or SRT files for conversion
|
|
|
|
| 545 |
gr.Markdown(description)
|
| 546 |
gr.Markdown(features)
|
| 547 |
|
| 548 |
+
with gr.Tabs() as tabs:
|
| 549 |
+
with gr.Tab("Single Speaker"):
|
| 550 |
+
with gr.Row():
|
| 551 |
+
with gr.Column(scale=3):
|
| 552 |
+
text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
|
| 553 |
+
file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
|
| 554 |
+
with gr.Column(scale=2):
|
| 555 |
+
voice_dropdown = gr.Dropdown(
|
| 556 |
+
choices=[""] + list(voices.keys()),
|
| 557 |
+
label="Select Voice",
|
| 558 |
+
value=list(voices.keys())[0] if voices else "",
|
| 559 |
+
)
|
| 560 |
+
rate_slider = gr.Slider(
|
| 561 |
+
minimum=-50,
|
| 562 |
+
maximum=50,
|
| 563 |
+
value=0,
|
| 564 |
+
label="Speech Rate Adjustment (%)",
|
| 565 |
+
step=1,
|
| 566 |
+
)
|
| 567 |
+
pitch_slider = gr.Slider(
|
| 568 |
+
minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
|
| 569 |
+
)
|
| 570 |
+
subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
|
| 571 |
+
gr.Markdown("""
|
| 572 |
+
**📝 Subtitle Timing Tip:**
|
| 573 |
+
|
| 574 |
+
When creating SRT files for continuous speech, avoid exact matching timestamps between segments.
|
| 575 |
+
|
| 576 |
+
**For smoother speech flow:**
|
| 577 |
+
```
|
| 578 |
+
1
|
| 579 |
+
00:00:00,112 --> 00:00:01,647
|
| 580 |
+
Hello how are you doing
|
| 581 |
+
|
| 582 |
+
2
|
| 583 |
+
00:00:01,617 --> 00:00:02,000
|
| 584 |
+
I'm fine
|
| 585 |
+
```
|
| 586 |
+
|
| 587 |
+
✅ Create a small overlap (20-30ms) between segments to prevent pauses
|
| 588 |
+
❌ Avoid exact matching timestamps (where end time = next start time) except you want a pause
|
| 589 |
+
""")
|
| 590 |
+
|
| 591 |
+
submit_single_btn = gr.Button("Convert to Speech", variant="primary")
|
| 592 |
+
warning_single_md = gr.Markdown(visible=False)
|
| 593 |
+
|
| 594 |
+
single_outputs = [
|
| 595 |
+
gr.Audio(label="Generated Audio", type="filepath"),
|
| 596 |
+
gr.File(label="Generated Subtitles"),
|
| 597 |
+
warning_single_md
|
| 598 |
+
]
|
| 599 |
+
|
| 600 |
+
# Handle file upload to update text
|
| 601 |
+
file_input.change(
|
| 602 |
+
fn=update_text_from_file,
|
| 603 |
+
inputs=[file_input],
|
| 604 |
+
outputs=[text_input, warning_single_md]
|
| 605 |
)
|
| 606 |
+
|
| 607 |
+
# Handle submit button for single speaker
|
| 608 |
+
submit_single_btn.click(
|
| 609 |
+
fn=tts_interface,
|
| 610 |
+
api_name="predict",
|
| 611 |
+
inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
|
| 612 |
+
outputs=single_outputs
|
| 613 |
)
|
| 614 |
+
|
| 615 |
+
with gr.Tab("Multi Speaker"):
|
| 616 |
+
with gr.Column():
|
| 617 |
+
multi_text_input = gr.Textbox(
|
| 618 |
+
label="Multi-Speaker Text (Format: 'Speaker1: text' or 'S1: text')",
|
| 619 |
+
lines=8,
|
| 620 |
+
value="Speaker1: Hello, this is the first speaker.\nSpeaker2: And I'm the second speaker!"
|
| 621 |
+
)
|
| 622 |
+
multi_subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
|
|
|
|
|
|
|
| 623 |
|
| 624 |
+
with gr.Row():
|
| 625 |
+
with gr.Column():
|
| 626 |
+
speaker1_voice = gr.Dropdown(
|
| 627 |
+
choices=[""] + list(voices.keys()),
|
| 628 |
+
label="Speaker 1 Voice",
|
| 629 |
+
value=list(voices.keys())[0] if voices else "",
|
| 630 |
+
)
|
| 631 |
+
speaker1_rate = gr.Slider(
|
| 632 |
+
minimum=-50,
|
| 633 |
+
maximum=50,
|
| 634 |
+
value=0,
|
| 635 |
+
label="Speaker 1 Rate (%)",
|
| 636 |
+
step=1,
|
| 637 |
+
)
|
| 638 |
+
speaker1_pitch = gr.Slider(
|
| 639 |
+
minimum=-20,
|
| 640 |
+
maximum=20,
|
| 641 |
+
value=0,
|
| 642 |
+
label="Speaker 1 Pitch (Hz)",
|
| 643 |
+
step=1,
|
| 644 |
+
)
|
| 645 |
+
|
| 646 |
+
with gr.Column():
|
| 647 |
+
speaker2_voice = gr.Dropdown(
|
| 648 |
+
choices=[""] + list(voices.keys()),
|
| 649 |
+
label="Speaker 2 Voice",
|
| 650 |
+
value=list(voices.keys())[10] if len(voices) > 10 else "",
|
| 651 |
+
)
|
| 652 |
+
speaker2_rate = gr.Slider(
|
| 653 |
+
minimum=-50,
|
| 654 |
+
maximum=50,
|
| 655 |
+
value=0,
|
| 656 |
+
label="Speaker 2 Rate (%)",
|
| 657 |
+
step=1,
|
| 658 |
+
)
|
| 659 |
+
speaker2_pitch = gr.Slider(
|
| 660 |
+
minimum=-20,
|
| 661 |
+
maximum=20,
|
| 662 |
+
value=0,
|
| 663 |
+
label="Speaker 2 Pitch (Hz)",
|
| 664 |
+
step=1,
|
| 665 |
+
)
|
| 666 |
|
| 667 |
+
submit_multi_btn = gr.Button("Convert Multi-Speaker to Speech", variant="primary")
|
| 668 |
+
warning_multi_md = gr.Markdown(visible=False)
|
| 669 |
+
|
| 670 |
+
multi_outputs = [
|
| 671 |
+
gr.Audio(label="Generated Audio", type="filepath"),
|
| 672 |
+
gr.File(label="Generated Subtitles"),
|
| 673 |
+
warning_multi_md
|
| 674 |
+
]
|
| 675 |
+
|
| 676 |
+
# Correctly pass the individual Gradio components to the click function
|
| 677 |
+
submit_multi_btn.click(
|
| 678 |
+
fn=multi_speaker_interface,
|
| 679 |
+
api_name="predict_multi",
|
| 680 |
+
inputs=[
|
| 681 |
+
multi_text_input,
|
| 682 |
+
multi_subtitle_checkbox,
|
| 683 |
+
speaker1_voice,
|
| 684 |
+
speaker1_rate,
|
| 685 |
+
speaker1_pitch,
|
| 686 |
+
speaker2_voice,
|
| 687 |
+
speaker2_rate,
|
| 688 |
+
speaker2_pitch
|
| 689 |
+
],
|
| 690 |
+
outputs=multi_outputs
|
| 691 |
+
)
|
|
|
|
|
|
|
| 692 |
|
| 693 |
+
gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion with support for both single speaker and multi-speaker scenarios!")
|
| 694 |
|
| 695 |
return demo
|
| 696 |
|
|
|
|
| 703 |
|
| 704 |
if __name__ == "__main__":
|
| 705 |
asyncio.run(main())
|
| 706 |
+
|