File size: 3,410 Bytes
f08f936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a44a43b
f08f936
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import asyncio
import os
import tempfile

import edge_tts
import gradio as gr


async def get_voices():
    """Return a mapping of display label to Edge TTS voice short name.

    The voice list comes from ``edge_tts.list_voices()`` and is ordered by
    locale, then by short name, so the resulting dict iterates in that order.
    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"``.
    """
    catalog = sorted(
        await edge_tts.list_voices(),
        key=lambda entry: (entry["Locale"], entry["ShortName"]),
    )

    labelled = {}
    for entry in catalog:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry["ShortName"]
    return labelled


async def text_to_speech(text, voice, rate, pitch):
    """Synthesize `text` with the given voice/rate/pitch and return an mp3 path.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected with a warning instead of being synthesized.
        voice: Voice label of the form ``"<ShortName> - <Locale> (<Gender>)"``.
            Only the part before the first ``" - "`` is passed to Edge TTS,
            so a bare short name works as well.
        rate: Speech-rate offset; truncated to an int and sent as a signed
            percentage string such as ``"+10%"``.
        pitch: Pitch offset; truncated to an int and sent as a signed Hz
            string such as ``"-5Hz"``.

    Returns:
        A ``(path, warning)`` tuple. On success ``path`` is the temporary
        ``.mp3`` file and ``warning`` is ``None``; on invalid input ``path``
        is ``None`` and ``warning`` is a user-facing message.

    Raises:
        Any exception raised by ``edge_tts`` while synthesizing. The temporary
        file is removed before the exception propagates.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return None, "Please enter some text to convert."
    if not voice:
        return None, "Please select a voice."

    voice_short_name = voice.partition(" - ")[0]
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Reserve a unique path; the handle is closed before edge_tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else will clean this up. BaseException is
        # caught so task cancellation is covered too; it is always re-raised.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # Best-effort cleanup; the original error matters more.
        raise

    return tmp_path, None


async def tts_interface(text, voice, rate, pitch):
    """Gradio click handler: run ``text_to_speech`` and surface any warning.

    Returns the generated audio path, or ``None`` after raising a
    ``gr.Warning`` when ``text_to_speech`` reports a problem.
    """
    audio_path, message = await text_to_speech(text, voice, rate, pitch)
    if not message:
        return audio_path

    gr.Warning(message)
    return None


async def create_demo():
    """Build the Gradio Blocks UI for the Edge TTS demo.

    Loads the voice labels through ``get_voices`` and preselects the first
    label starting with ``en-US-AriaNeural``. If there is none, the first
    label is used, or an empty string when no voices were returned. The
    "Generate Speech" button is wired to ``tts_interface``.

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """
    labels = list(await get_voices())

    # Choose the initial dropdown value: Aria first, then any voice, then "".
    aria_matches = [
        label for label in labels if label.startswith("en-US-AriaNeural")
    ]
    if aria_matches:
        initial_voice = aria_matches[0]
    elif labels:
        initial_voice = labels[0]
    else:
        initial_voice = ""

    intro = (
        "Convert text to speech using Microsoft Edge's online TTS voices. "
        "Adjust rate and pitch as percentages/Hz offsets from the default "
        "(0 = unchanged, positive = faster/higher, negative = slower/lower)."
    )

    with gr.Blocks(analytics_enabled=False, title="Edge TTS Text-to-Speech") as app:
        gr.Markdown("# 🎙️ Edge TTS Text-to-Speech")
        gr.Markdown(intro)

        with gr.Row():
            # Left column: all inputs plus the trigger button.
            with gr.Column():
                text_box = gr.Textbox(
                    label="Input Text",
                    lines=6,
                    placeholder="Type or paste the text you want to hear...",
                )
                voice_picker = gr.Dropdown(
                    choices=labels,
                    label="Voice",
                    value=initial_voice,
                    filterable=True,
                )
                rate_control = gr.Slider(
                    minimum=-50,
                    maximum=50,
                    value=0,
                    step=1,
                    label="Speech Rate Adjustment (%)",
                )
                pitch_control = gr.Slider(
                    minimum=-20,
                    maximum=20,
                    value=0,
                    step=1,
                    label="Pitch Adjustment (Hz)",
                )
                run_button = gr.Button("Generate Speech", variant="primary")

            # Right column: the synthesized audio player.
            with gr.Column():
                audio_player = gr.Audio(label="Generated Audio", type="filepath")

        handler_inputs = [text_box, voice_picker, rate_control, pitch_control]
        run_button.click(
            fn=tts_interface,
            inputs=handler_inputs,
            outputs=audio_player,
        )

    return app


async def main():
    """Build the demo, enable its request queue, and launch the server."""
    app = await create_demo()
    # queue() returns the Blocks instance, so launch() can be chained onto it.
    app.queue(default_concurrency_limit=20).launch()


# Script entry point: run the async main() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())