walidadebayo commited on
Commit
4f2415a
·
1 Parent(s): 16311fa

Add multi-speaker support and interface for text-to-speech conversion

Browse files
Files changed (1) hide show
  1. app.py +354 -61
app.py CHANGED
@@ -311,6 +311,217 @@ async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_f
311
  return audio, subtitle, None
312
 
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  async def create_demo():
315
  voices = await get_voices()
316
 
@@ -322,6 +533,7 @@ async def create_demo():
322
 
323
  features = """
324
  ## ✨ Latest Features
 
325
  - **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
326
  - **SRT Generation**: Create subtitle files alongside your audio for perfect timing
327
  - **File Upload**: Easily upload TXT or SRT files for conversion
@@ -333,72 +545,152 @@ async def create_demo():
333
  gr.Markdown(description)
334
  gr.Markdown(features)
335
 
336
- with gr.Row():
337
- with gr.Column(scale=3):
338
- text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
339
- file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
340
- with gr.Column(scale=2):
341
- voice_dropdown = gr.Dropdown(
342
- choices=[""] + list(voices.keys()),
343
- label="Select Voice",
344
- value=list(voices.keys())[0] if voices else "",
345
- )
346
- rate_slider = gr.Slider(
347
- minimum=-50,
348
- maximum=50,
349
- value=0,
350
- label="Speech Rate Adjustment (%)",
351
- step=1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  )
353
- pitch_slider = gr.Slider(
354
- minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
 
 
 
 
 
355
  )
356
- subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
357
- gr.Markdown("""
358
- **📝 Subtitle Timing Tip:**
359
-
360
- When creating SRT files for continuous speech, avoid exact matching timestamps between segments.
361
-
362
- **For smoother speech flow:**
363
- ```
364
- 1
365
- 00:00:00,112 --> 00:00:01,647
366
- Hello how are you doing
367
 
368
- 2
369
- 00:00:01,617 --> 00:00:02,000
370
- I'm fine
371
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
- Create a small overlap (20-30ms) between segments to prevent pauses
374
- Avoid exact matching timestamps (where end time = next start time) except you want a pause
375
- """)
376
-
377
- submit_btn = gr.Button("Convert to Speech", variant="primary")
378
- warning_md = gr.Markdown(visible=False)
379
-
380
- outputs = [
381
- gr.Audio(label="Generated Audio", type="filepath"),
382
- gr.File(label="Generated Subtitles"),
383
- warning_md
384
- ]
385
-
386
- # Handle file upload to update text
387
- file_input.change(
388
- fn=update_text_from_file,
389
- inputs=[file_input],
390
- outputs=[text_input, warning_md]
391
- )
392
-
393
- # Handle submit button
394
- submit_btn.click(
395
- fn=tts_interface,
396
- api_name="predict",
397
- inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
398
- outputs=outputs
399
- )
400
 
401
- gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
402
 
403
  return demo
404
 
@@ -411,3 +703,4 @@ async def main():
411
 
412
  if __name__ == "__main__":
413
  asyncio.run(main())
 
 
311
  return audio, subtitle, None
312
 
313
 
314
async def parse_multi_speaker_text(text):
    """Split *text* into ordered speaker segments.

    Recognizes designation lines of the form ``Speaker1: hello`` or
    ``S1: hello`` (case-insensitive, optional spaces around the colon).
    Non-empty lines without a designation continue the most recent
    speaker; lines before any designation are ignored.

    Returns a list of ``{'speaker': str, 'text': str}`` dicts in input order.
    """
    marker = re.compile(r'^(Speaker\s*\d+|S\d+)\s*:\s*(.*)$', re.IGNORECASE)

    segments = []
    speaker = None
    buffered = []

    for raw in text.split('\n'):
        line = raw.strip()
        hit = marker.match(line)
        if hit:
            # A new designation closes out whatever the previous speaker said.
            if speaker and buffered:
                segments.append({'speaker': speaker, 'text': ' '.join(buffered).strip()})
            buffered = []
            speaker = hit.group(1).strip()
            inline_text = hit.group(2).strip()
            if inline_text:  # text may follow the designation on the same line
                buffered.append(inline_text)
        elif line and speaker:
            # Continuation line for the current speaker.
            buffered.append(line)

    # Flush the trailing speaker, if any text was collected.
    if speaker and buffered:
        segments.append({'speaker': speaker, 'text': ' '.join(buffered).strip()})

    return segments
349
+
350
async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
    """Generate a single audio file (and optional SRT) from multi-speaker text.

    Parameters
    ----------
    text : str
        Text with speaker designations (e.g. ``Speaker1: Hello``), parsed by
        ``parse_multi_speaker_text``.
    speaker_settings : list[dict]
        One dict per speaker with keys ``voice`` (display name; short name is
        the part before ``" - "``), ``rate`` (int, percent) and ``pitch``
        (int, Hz). ``SpeakerN``/``SN`` maps to index ``N-1``, clamped to the
        last available entry.
    generate_subtitles : bool
        When True, also write an SRT file whose lines carry ``[SpeakerN]``
        prefixes.

    Returns
    -------
    (audio_path, subtitle_path, warning)
        ``warning`` is an error-message string on failure (with the paths
        set to None), otherwise None.
    """
    from pydub import AudioSegment  # hoisted: original re-imported inside the loop

    if not text.strip():
        return None, None, "Please enter text to convert."

    # Parse the multi-speaker text into ordered (speaker, text) segments.
    speaker_segments = await parse_multi_speaker_text(text)
    if not speaker_segments:
        return None, None, "No valid speaker segments found in the text."

    # Output files must outlive this call, so they are created with
    # delete=False and kept outside the temporary working directory below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        final_audio_path = tmp_file.name

    subtitle_path = None
    if generate_subtitles:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
            subtitle_path = srt_file.name

    # Per-segment audio is rendered into a scratch directory, then concatenated.
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_segments = []
        subtitle_entries = []
        current_offset = 0  # total duration of previous segments, in ms

        for i, segment in enumerate(speaker_segments):
            speaker = segment['speaker']
            seg_text = segment['text']  # renamed: original shadowed the `text` parameter

            # Map "Speaker2"/"S2" -> index 1; clamp so extra speakers reuse
            # the last configured voice instead of indexing out of range.
            num_match = re.search(r'\d+', speaker)
            speaker_num = int(num_match.group()) if num_match else 1
            speaker_idx = min(speaker_num - 1, len(speaker_settings) - 1)

            if speaker_idx < 0 or speaker_idx >= len(speaker_settings) or not speaker_settings[speaker_idx]['voice']:
                return None, None, f"No voice selected for {speaker}."

            settings = speaker_settings[speaker_idx]
            voice_short_name = settings['voice'].split(" - ")[0]
            rate_str = f"{settings['rate']:+d}%"
            pitch_str = f"{settings['pitch']:+d}Hz"

            segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
            communicate = edge_tts.Communicate(seg_text, voice_short_name, rate=rate_str, pitch=pitch_str)

            if generate_subtitles:
                word_boundaries = []
                # Open the segment file once and stream all chunks into it
                # (original reopened the file in append mode per chunk).
                with open(segment_file, "wb") as audio_file:
                    async for chunk in communicate.stream():
                        if chunk["type"] == "audio":
                            audio_file.write(chunk["data"])
                        elif chunk["type"] == "WordBoundary":
                            # Shift into the combined timeline: boundary offsets
                            # are in 100 ns units, current_offset is in ms.
                            adjusted_chunk = chunk.copy()
                            adjusted_chunk["offset"] += current_offset * 10000
                            word_boundaries.append(adjusted_chunk)

                subtitle_entries.extend(_group_word_boundaries(word_boundaries, speaker))
            else:
                # No word boundaries needed: let edge-tts write the file directly.
                await communicate.save(segment_file)

            audio = AudioSegment.from_file(segment_file)
            duration = len(audio)  # pydub reports length in ms

            audio_segments.append({'file': segment_file, 'duration': duration})
            current_offset += duration

        # Concatenate all segments into the final output file.
        combined = AudioSegment.empty()
        for seg in audio_segments:
            combined += AudioSegment.from_file(seg['file'])
        combined.export(final_audio_path, format="mp3")

    # Write the SRT file, if requested.
    if generate_subtitles and subtitle_path:
        with open(subtitle_path, "w", encoding="utf-8") as f:
            for i, entry in enumerate(subtitle_entries):
                f.write(f"{i+1}\n")
                f.write(f"{format_time(entry['start'])} --> {format_time(entry['end'])}\n")
                f.write(f"{entry['text']}\n\n")

    return final_audio_path, subtitle_path, None


def _group_word_boundaries(word_boundaries, speaker):
    """Group streamed WordBoundary events into subtitle phrases.

    A phrase breaks on sentence punctuation, after 5 words, or before a pause
    longer than 300 ms. Each completed phrase is appended exactly once —
    fixing the original bug where the accumulated ``phrases`` list was
    re-extended into the subtitle entries on *every* break (without being
    cleared), duplicating earlier phrases in the generated SRT.

    Times are converted from the stream's 100 ns units to ms for `format_time`.
    Returns a list of ``{'text', 'start', 'end'}`` dicts.
    """
    phrases = []
    current_phrase = []
    current_text = ""
    phrase_start = 0

    for j, boundary in enumerate(word_boundaries):
        word = boundary["text"]
        start_time = boundary["offset"] / 10000
        end_time = start_time + boundary["duration"] / 10000

        if not current_phrase:
            phrase_start = start_time
        current_phrase.append(boundary)

        # Attach punctuation tokens to the preceding word instead of
        # inserting a space before them.
        if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
            current_text = current_text.rstrip() + word + " "
        else:
            current_text += word + " "

        # Decide whether this word ends the current phrase.
        should_break = word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(word_boundaries) - 1
        if not should_break and len(current_phrase) >= 5:
            should_break = True
        if not should_break and j < len(word_boundaries) - 1:
            next_start = word_boundaries[j + 1]["offset"] / 10000
            if next_start - end_time > 300:  # gap > 300 ms -> new subtitle
                should_break = True

        if should_break:
            last = current_phrase[-1]
            phrase_end = (last["offset"] + last["duration"]) / 10000
            phrases.append({
                "text": f"[{speaker}] {current_text.strip()}",
                "start": phrase_start,
                "end": phrase_end,
            })
            current_phrase = []
            current_text = ""

    return phrases
494
+
495
async def multi_speaker_interface(text, generate_subtitles, speaker1_voice, speaker1_rate, speaker1_pitch,
                                  speaker2_voice, speaker2_rate, speaker2_pitch):
    """Gradio entry point for multi-speaker TTS.

    Collects the per-speaker controls into a settings list (skipping any
    speaker whose voice is unselected) and delegates to ``multi_speaker_tts``.
    Returns (audio, subtitle, warning) suitable for the output components.

    NOTE(review): if only Speaker 2's voice is selected, its settings land at
    index 0 and "Speaker1" text in the input maps to them — confirm this is
    the intended fallback behavior.
    """
    # Keep only speakers that actually have a voice selected, in order.
    speaker_settings = [
        {'voice': voice, 'rate': rate, 'pitch': pitch}
        for voice, rate, pitch in (
            (speaker1_voice, speaker1_rate, speaker1_pitch),
            (speaker2_voice, speaker2_rate, speaker2_pitch),
        )
        if voice
    ]

    if not speaker_settings:
        return None, None, gr.Warning("Please select at least one speaker voice.")

    audio, subtitle, warning = await multi_speaker_tts(text, speaker_settings, generate_subtitles)
    if warning:
        return audio, subtitle, gr.Warning(warning)
    return audio, subtitle, None
524
+
525
  async def create_demo():
526
  voices = await get_voices()
527
 
 
533
 
534
  features = """
535
  ## ✨ Latest Features
536
+ - **Single & Multi-Speaker Support**: Choose between single speaker or multi-speaker modes
537
  - **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
538
  - **SRT Generation**: Create subtitle files alongside your audio for perfect timing
539
  - **File Upload**: Easily upload TXT or SRT files for conversion
 
545
  gr.Markdown(description)
546
  gr.Markdown(features)
547
 
548
+ with gr.Tabs() as tabs:
549
+ with gr.Tab("Single Speaker"):
550
+ with gr.Row():
551
+ with gr.Column(scale=3):
552
+ text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
553
+ file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
554
+ with gr.Column(scale=2):
555
+ voice_dropdown = gr.Dropdown(
556
+ choices=[""] + list(voices.keys()),
557
+ label="Select Voice",
558
+ value=list(voices.keys())[0] if voices else "",
559
+ )
560
+ rate_slider = gr.Slider(
561
+ minimum=-50,
562
+ maximum=50,
563
+ value=0,
564
+ label="Speech Rate Adjustment (%)",
565
+ step=1,
566
+ )
567
+ pitch_slider = gr.Slider(
568
+ minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
569
+ )
570
+ subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
571
+ gr.Markdown("""
572
+ **📝 Subtitle Timing Tip:**
573
+
574
+ When creating SRT files for continuous speech, avoid exact matching timestamps between segments.
575
+
576
+ **For smoother speech flow:**
577
+ ```
578
+ 1
579
+ 00:00:00,112 --> 00:00:01,647
580
+ Hello how are you doing
581
+
582
+ 2
583
+ 00:00:01,617 --> 00:00:02,000
584
+ I'm fine
585
+ ```
586
+
587
+ ✅ Create a small overlap (20-30ms) between segments to prevent pauses
588
+ ❌ Avoid exact matching timestamps (where end time = next start time) unless you want a pause
589
+ """)
590
+
591
+ submit_single_btn = gr.Button("Convert to Speech", variant="primary")
592
+ warning_single_md = gr.Markdown(visible=False)
593
+
594
+ single_outputs = [
595
+ gr.Audio(label="Generated Audio", type="filepath"),
596
+ gr.File(label="Generated Subtitles"),
597
+ warning_single_md
598
+ ]
599
+
600
+ # Handle file upload to update text
601
+ file_input.change(
602
+ fn=update_text_from_file,
603
+ inputs=[file_input],
604
+ outputs=[text_input, warning_single_md]
605
  )
606
+
607
+ # Handle submit button for single speaker
608
+ submit_single_btn.click(
609
+ fn=tts_interface,
610
+ api_name="predict",
611
+ inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
612
+ outputs=single_outputs
613
  )
614
+
615
+ with gr.Tab("Multi Speaker"):
616
+ with gr.Column():
617
+ multi_text_input = gr.Textbox(
618
+ label="Multi-Speaker Text (Format: 'Speaker1: text' or 'S1: text')",
619
+ lines=8,
620
+ value="Speaker1: Hello, this is the first speaker.\nSpeaker2: And I'm the second speaker!"
621
+ )
622
+ multi_subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
 
 
623
 
624
+ with gr.Row():
625
+ with gr.Column():
626
+ speaker1_voice = gr.Dropdown(
627
+ choices=[""] + list(voices.keys()),
628
+ label="Speaker 1 Voice",
629
+ value=list(voices.keys())[0] if voices else "",
630
+ )
631
+ speaker1_rate = gr.Slider(
632
+ minimum=-50,
633
+ maximum=50,
634
+ value=0,
635
+ label="Speaker 1 Rate (%)",
636
+ step=1,
637
+ )
638
+ speaker1_pitch = gr.Slider(
639
+ minimum=-20,
640
+ maximum=20,
641
+ value=0,
642
+ label="Speaker 1 Pitch (Hz)",
643
+ step=1,
644
+ )
645
+
646
+ with gr.Column():
647
+ speaker2_voice = gr.Dropdown(
648
+ choices=[""] + list(voices.keys()),
649
+ label="Speaker 2 Voice",
650
+ value=list(voices.keys())[10] if len(voices) > 10 else "",
651
+ )
652
+ speaker2_rate = gr.Slider(
653
+ minimum=-50,
654
+ maximum=50,
655
+ value=0,
656
+ label="Speaker 2 Rate (%)",
657
+ step=1,
658
+ )
659
+ speaker2_pitch = gr.Slider(
660
+ minimum=-20,
661
+ maximum=20,
662
+ value=0,
663
+ label="Speaker 2 Pitch (Hz)",
664
+ step=1,
665
+ )
666
 
667
+ submit_multi_btn = gr.Button("Convert Multi-Speaker to Speech", variant="primary")
668
+ warning_multi_md = gr.Markdown(visible=False)
669
+
670
+ multi_outputs = [
671
+ gr.Audio(label="Generated Audio", type="filepath"),
672
+ gr.File(label="Generated Subtitles"),
673
+ warning_multi_md
674
+ ]
675
+
676
+ # Correctly pass the individual Gradio components to the click function
677
+ submit_multi_btn.click(
678
+ fn=multi_speaker_interface,
679
+ api_name="predict_multi",
680
+ inputs=[
681
+ multi_text_input,
682
+ multi_subtitle_checkbox,
683
+ speaker1_voice,
684
+ speaker1_rate,
685
+ speaker1_pitch,
686
+ speaker2_voice,
687
+ speaker2_rate,
688
+ speaker2_pitch
689
+ ],
690
+ outputs=multi_outputs
691
+ )
 
 
692
 
693
+ gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion with support for both single speaker and multi-speaker scenarios!")
694
 
695
  return demo
696
 
 
703
 
704
  if __name__ == "__main__":
705
  asyncio.run(main())
706
+