walidadebayo commited on
Commit
4f2415a
·
1 Parent(s): 16311fa

Add multi-speaker support and interface for text-to-speech conversion

Browse files
Files changed (1) hide show
  1. app.py +354 -61
app.py CHANGED
@@ -311,6 +311,217 @@ async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_f
311
  return audio, subtitle, None
312
 
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  async def create_demo():
315
  voices = await get_voices()
316
 
@@ -322,6 +533,7 @@ async def create_demo():
322
 
323
  features = """
324
  ## ✨ Latest Features
 
325
  - **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
326
  - **SRT Generation**: Create subtitle files alongside your audio for perfect timing
327
  - **File Upload**: Easily upload TXT or SRT files for conversion
@@ -333,72 +545,152 @@ async def create_demo():
333
  gr.Markdown(description)
334
  gr.Markdown(features)
335
 
336
- with gr.Row():
337
- with gr.Column(scale=3):
338
- text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
339
- file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
340
- with gr.Column(scale=2):
341
- voice_dropdown = gr.Dropdown(
342
- choices=[""] + list(voices.keys()),
343
- label="Select Voice",
344
- value=list(voices.keys())[0] if voices else "",
345
- )
346
- rate_slider = gr.Slider(
347
- minimum=-50,
348
- maximum=50,
349
- value=0,
350
- label="Speech Rate Adjustment (%)",
351
- step=1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  )
353
- pitch_slider = gr.Slider(
354
- minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
 
 
 
 
 
355
  )
356
- subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
357
- gr.Markdown("""
358
- **📝 Subtitle Timing Tip:**
359
-
360
- When creating SRT files for continuous speech, avoid exact matching timestamps between segments.
361
-
362
- **For smoother speech flow:**
363
- ```
364
- 1
365
- 00:00:00,112 --> 00:00:01,647
366
- Hello how are you doing
367
 
368
- 2
369
- 00:00:01,617 --> 00:00:02,000
370
- I'm fine
371
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
- Create a small overlap (20-30ms) between segments to prevent pauses
374
- Avoid exact matching timestamps (where end time = next start time) except you want a pause
375
- """)
376
-
377
- submit_btn = gr.Button("Convert to Speech", variant="primary")
378
- warning_md = gr.Markdown(visible=False)
379
-
380
- outputs = [
381
- gr.Audio(label="Generated Audio", type="filepath"),
382
- gr.File(label="Generated Subtitles"),
383
- warning_md
384
- ]
385
-
386
- # Handle file upload to update text
387
- file_input.change(
388
- fn=update_text_from_file,
389
- inputs=[file_input],
390
- outputs=[text_input, warning_md]
391
- )
392
-
393
- # Handle submit button
394
- submit_btn.click(
395
- fn=tts_interface,
396
- api_name="predict",
397
- inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
398
- outputs=outputs
399
- )
400
 
401
- gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
402
 
403
  return demo
404
 
@@ -411,3 +703,4 @@ async def main():
411
 
412
  if __name__ == "__main__":
413
  asyncio.run(main())
 
 
311
  return audio, subtitle, None
312
 
313
 
314
async def parse_multi_speaker_text(text):
    """Split *text* into ordered speaker segments.

    Recognizes designation lines of the form ``Speaker1: hello`` or
    ``S1: hello`` (case-insensitive, optional spaces around the colon).
    Non-empty lines without a designation continue the most recent
    speaker; lines before any designation are ignored.

    Returns a list of ``{'speaker': str, 'text': str}`` dicts in input order.
    """
    marker = re.compile(r'^(Speaker\s*\d+|S\d+)\s*:\s*(.*)$', re.IGNORECASE)

    segments = []
    speaker = None
    buffered = []

    for raw in text.split('\n'):
        line = raw.strip()
        hit = marker.match(line)
        if hit:
            # A new designation closes out whatever the previous speaker said.
            if speaker and buffered:
                segments.append({'speaker': speaker, 'text': ' '.join(buffered).strip()})
            buffered = []
            speaker = hit.group(1).strip()
            inline_text = hit.group(2).strip()
            if inline_text:  # text may follow the designation on the same line
                buffered.append(inline_text)
        elif line and speaker:
            # Continuation line for the current speaker.
            buffered.append(line)

    # Flush the trailing speaker, if any text was collected.
    if speaker and buffered:
        segments.append({'speaker': speaker, 'text': ' '.join(buffered).strip()})

    return segments
349
+
350
async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
    """Generate a single audio file (and optional SRT) from multi-speaker text.

    Parameters
    ----------
    text : str
        Text with speaker designations (e.g. ``Speaker1: Hello``), parsed by
        ``parse_multi_speaker_text``.
    speaker_settings : list[dict]
        One dict per speaker with keys ``voice`` (display name; short name is
        the part before ``" - "``), ``rate`` (int, percent) and ``pitch``
        (int, Hz). ``SpeakerN``/``SN`` maps to index ``N-1``, clamped to the
        last available entry.
    generate_subtitles : bool
        When True, also write an SRT file whose lines carry ``[SpeakerN]``
        prefixes.

    Returns
    -------
    (audio_path, subtitle_path, warning)
        ``warning`` is an error-message string on failure (with the paths
        set to None), otherwise None.
    """
    from pydub import AudioSegment  # hoisted: original re-imported inside the loop

    if not text.strip():
        return None, None, "Please enter text to convert."

    # Parse the multi-speaker text into ordered (speaker, text) segments.
    speaker_segments = await parse_multi_speaker_text(text)
    if not speaker_segments:
        return None, None, "No valid speaker segments found in the text."

    # Output files must outlive this call, so they are created with
    # delete=False and kept outside the temporary working directory below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        final_audio_path = tmp_file.name

    subtitle_path = None
    if generate_subtitles:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
            subtitle_path = srt_file.name

    # Per-segment audio is rendered into a scratch directory, then concatenated.
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_segments = []
        subtitle_entries = []
        current_offset = 0  # total duration of previous segments, in ms

        for i, segment in enumerate(speaker_segments):
            speaker = segment['speaker']
            seg_text = segment['text']  # renamed: original shadowed the `text` parameter

            # Map "Speaker2"/"S2" -> index 1; clamp so extra speakers reuse
            # the last configured voice instead of indexing out of range.
            num_match = re.search(r'\d+', speaker)
            speaker_num = int(num_match.group()) if num_match else 1
            speaker_idx = min(speaker_num - 1, len(speaker_settings) - 1)

            if speaker_idx < 0 or speaker_idx >= len(speaker_settings) or not speaker_settings[speaker_idx]['voice']:
                return None, None, f"No voice selected for {speaker}."

            settings = speaker_settings[speaker_idx]
            voice_short_name = settings['voice'].split(" - ")[0]
            rate_str = f"{settings['rate']:+d}%"
            pitch_str = f"{settings['pitch']:+d}Hz"

            segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
            communicate = edge_tts.Communicate(seg_text, voice_short_name, rate=rate_str, pitch=pitch_str)

            if generate_subtitles:
                word_boundaries = []
                # Open the segment file once and stream all chunks into it
                # (original reopened the file in append mode per chunk).
                with open(segment_file, "wb") as audio_file:
                    async for chunk in communicate.stream():
                        if chunk["type"] == "audio":
                            audio_file.write(chunk["data"])
                        elif chunk["type"] == "WordBoundary":
                            # Shift into the combined timeline: boundary offsets
                            # are in 100 ns units, current_offset is in ms.
                            adjusted_chunk = chunk.copy()
                            adjusted_chunk["offset"] += current_offset * 10000
                            word_boundaries.append(adjusted_chunk)

                subtitle_entries.extend(_group_word_boundaries(word_boundaries, speaker))
            else:
                # No word boundaries needed: let edge-tts write the file directly.
                await communicate.save(segment_file)

            audio = AudioSegment.from_file(segment_file)
            duration = len(audio)  # pydub reports length in ms

            audio_segments.append({'file': segment_file, 'duration': duration})
            current_offset += duration

        # Concatenate all segments into the final output file.
        combined = AudioSegment.empty()
        for seg in audio_segments:
            combined += AudioSegment.from_file(seg['file'])
        combined.export(final_audio_path, format="mp3")

    # Write the SRT file, if requested.
    if generate_subtitles and subtitle_path:
        with open(subtitle_path, "w", encoding="utf-8") as f:
            for i, entry in enumerate(subtitle_entries):
                f.write(f"{i+1}\n")
                f.write(f"{format_time(entry['start'])} --> {format_time(entry['end'])}\n")
                f.write(f"{entry['text']}\n\n")

    return final_audio_path, subtitle_path, None


def _group_word_boundaries(word_boundaries, speaker):
    """Group streamed WordBoundary events into subtitle phrases.

    A phrase breaks on sentence punctuation, after 5 words, or before a pause
    longer than 300 ms. Each completed phrase is appended exactly once —
    fixing the original bug where the accumulated ``phrases`` list was
    re-extended into the subtitle entries on *every* break (without being
    cleared), duplicating earlier phrases in the generated SRT.

    Times are converted from the stream's 100 ns units to ms for `format_time`.
    Returns a list of ``{'text', 'start', 'end'}`` dicts.
    """
    phrases = []
    current_phrase = []
    current_text = ""
    phrase_start = 0

    for j, boundary in enumerate(word_boundaries):
        word = boundary["text"]
        start_time = boundary["offset"] / 10000
        end_time = start_time + boundary["duration"] / 10000

        if not current_phrase:
            phrase_start = start_time
        current_phrase.append(boundary)

        # Attach punctuation tokens to the preceding word instead of
        # inserting a space before them.
        if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
            current_text = current_text.rstrip() + word + " "
        else:
            current_text += word + " "

        # Decide whether this word ends the current phrase.
        should_break = word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(word_boundaries) - 1
        if not should_break and len(current_phrase) >= 5:
            should_break = True
        if not should_break and j < len(word_boundaries) - 1:
            next_start = word_boundaries[j + 1]["offset"] / 10000
            if next_start - end_time > 300:  # gap > 300 ms -> new subtitle
                should_break = True

        if should_break:
            last = current_phrase[-1]
            phrase_end = (last["offset"] + last["duration"]) / 10000
            phrases.append({
                "text": f"[{speaker}] {current_text.strip()}",
                "start": phrase_start,
                "end": phrase_end,
            })
            current_phrase = []
            current_text = ""

    return phrases
494
+
495
async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
                                  speaker2_voice, speaker2_rate, speaker2_pitch):
    """Gradio entry point for multi-speaker TTS.

    Collects the per-speaker controls into a settings list (skipping any
    speaker whose voice is unselected) and delegates to ``multi_speaker_tts``.
    Returns (audio, subtitle, warning) suitable for the output components.

    NOTE(review): if only Speaker 2's voice is selected, its settings land at
    index 0 and "Speaker1" text in the input maps to them — confirm this is
    the intended fallback behavior.
    """
    # Keep only speakers that actually have a voice selected, in order.
    speaker_settings = [
        {'voice': voice, 'rate': rate, 'pitch': pitch}
        for voice, rate, pitch in (
            (speaker1_voice, speaker1_rate, speaker1_pitch),
            (speaker2_voice, speaker2_rate, speaker2_pitch),
        )
        if voice
    ]

    if not speaker_settings:
        return None, None, gr.Warning("Please select at least one speaker voice.")

    audio, subtitle, warning = await multi_speaker_tts(text, speaker_settings, generate_subtitles)
    if warning:
        return audio, subtitle, gr.Warning(warning)
    return audio, subtitle, None
524
+
525
  async def create_demo():
526
  voices = await get_voices()
527
 
 
533
 
534
  features = """
535
  ## ✨ Latest Features
536
+ - **Single & Multi-Speaker Support**: Choose between single speaker or multi-speaker modes
537
  - **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
538
  - **SRT Generation**: Create subtitle files alongside your audio for perfect timing
539
  - **File Upload**: Easily upload TXT or SRT files for conversion
 
545
  gr.Markdown(description)
546
  gr.Markdown(features)
547
 
548
+ with gr.Tabs() as tabs:
549
+ with gr.Tab("Single Speaker"):
550
+ with gr.Row():
551
+ with gr.Column(scale=3):
552
+ text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
553
+ file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
554
+ with gr.Column(scale=2):
555
+ voice_dropdown = gr.Dropdown(
556
+ choices=[""] + list(voices.keys()),
557
+ label="Select Voice",
558
+ value=list(voices.keys())[0] if voices else "",
559
+ )
560
+ rate_slider = gr.Slider(
561
+ minimum=-50,
562
+ maximum=50,
563
+ value=0,
564
+ label="Speech Rate Adjustment (%)",
565
+ step=1,
566
+ )
567
+ pitch_slider = gr.Slider(
568
+ minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
569
+ )
570
+ subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
571
+ gr.Markdown("""
572
+ **📝 Subtitle Timing Tip:**
573
+
574
+ When creating SRT files for continuous speech, avoid exact matching timestamps between segments.
575
+
576
+ **For smoother speech flow:**
577
+ ```
578
+ 1
579
+ 00:00:00,112 --> 00:00:01,647
580
+ Hello how are you doing
581
+
582
+ 2
583
+ 00:00:01,617 --> 00:00:02,000
584
+ I'm fine
585
+ ```
586
+
587
+ ✅ Create a small overlap (20-30ms) between segments to prevent pauses
588
+ ❌ Avoid exact matching timestamps (where end time = next start time) unless you want a pause
589
+ """)
590
+
591
+ submit_single_btn = gr.Button("Convert to Speech", variant="primary")
592
+ warning_single_md = gr.Markdown(visible=False)
593
+
594
+ single_outputs = [
595
+ gr.Audio(label="Generated Audio", type="filepath"),
596
+ gr.File(label="Generated Subtitles"),
597
+ warning_single_md
598
+ ]
599
+
600
+ # Handle file upload to update text
601
+ file_input.change(
602
+ fn=update_text_from_file,
603
+ inputs=[file_input],
604
+ outputs=[text_input, warning_single_md]
605
  )
606
+
607
+ # Handle submit button for single speaker
608
+ submit_single_btn.click(
609
+ fn=tts_interface,
610
+ api_name="predict",
611
+ inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
612
+ outputs=single_outputs
613
  )
614
+
615
+ with gr.Tab("Multi Speaker"):
616
+ with gr.Column():
617
+ multi_text_input = gr.Textbox(
618
+ label="Multi-Speaker Text (Format: 'Speaker1: text' or 'S1: text')",
619
+ lines=8,
620
+ value="Speaker1: Hello, this is the first speaker.\nSpeaker2: And I'm the second speaker!"
621
+ )
622
+ multi_subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
 
 
623
 
624
+ with gr.Row():
625
+ with gr.Column():
626
+ speaker1_voice = gr.Dropdown(
627
+ choices=[""] + list(voices.keys()),
628
+ label="Speaker 1 Voice",
629
+ value=list(voices.keys())[0] if voices else "",
630
+ )
631
+ speaker1_rate = gr.Slider(
632
+ minimum=-50,
633
+ maximum=50,
634
+ value=0,
635
+ label="Speaker 1 Rate (%)",
636
+ step=1,
637
+ )
638
+ speaker1_pitch = gr.Slider(
639
+ minimum=-20,
640
+ maximum=20,
641
+ value=0,
642
+ label="Speaker 1 Pitch (Hz)",
643
+ step=1,
644
+ )
645
+
646
+ with gr.Column():
647
+ speaker2_voice = gr.Dropdown(
648
+ choices=[""] + list(voices.keys()),
649
+ label="Speaker 2 Voice",
650
+ value=list(voices.keys())[10] if len(voices) > 10 else "",
651
+ )
652
+ speaker2_rate = gr.Slider(
653
+ minimum=-50,
654
+ maximum=50,
655
+ value=0,
656
+ label="Speaker 2 Rate (%)",
657
+ step=1,
658
+ )
659
+ speaker2_pitch = gr.Slider(
660
+ minimum=-20,
661
+ maximum=20,
662
+ value=0,
663
+ label="Speaker 2 Pitch (Hz)",
664
+ step=1,
665
+ )
666
 
667
+ submit_multi_btn = gr.Button("Convert Multi-Speaker to Speech", variant="primary")
668
+ warning_multi_md = gr.Markdown(visible=False)
669
+
670
+ multi_outputs = [
671
+ gr.Audio(label="Generated Audio", type="filepath"),
672
+ gr.File(label="Generated Subtitles"),
673
+ warning_multi_md
674
+ ]
675
+
676
+ # Correctly pass the individual Gradio components to the click function
677
+ submit_multi_btn.click(
678
+ fn=multi_speaker_interface,
679
+ api_name="predict_multi",
680
+ inputs=[
681
+ multi_text_input,
682
+ multi_subtitle_checkbox,
683
+ speaker1_voice,
684
+ speaker1_rate,
685
+ speaker1_pitch,
686
+ speaker2_voice,
687
+ speaker2_rate,
688
+ speaker2_pitch
689
+ ],
690
+ outputs=multi_outputs
691
+ )
 
 
692
 
693
+ gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion with support for both single speaker and multi-speaker scenarios!")
694
 
695
  return demo
696
 
 
703
 
704
  if __name__ == "__main__":
705
  asyncio.run(main())
706
+