Spaces:
Running
Running
Upload 3 files
Browse files- README.md +86 -13
- app.py +217 -0
- requirements.txt +2 -0
README.md
CHANGED
|
@@ -1,13 +1,86 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Microsoft Online Edge TTS & MCP Server
|
| 2 |
+
|
| 3 |
+
A Model Context Protocol server that enables LLMs to convert text to speech using the free [Microsoft Edge TTS](https://github.com/rany2/edge-tts). Both an API and an MCP endpoint are available.
|
| 4 |
+
### Online Demo
|
| 5 |
+
https://modelscope.cn/studios/redstoneleo/English_Speaking_Practice_Assistant/
|
| 6 |
+
### Available Tools
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
- `list_voices` – Fetches the available voices from Microsoft Edge TTS.
|
| 10 |
+
|
| 11 |
+
- **Returns** (object): A dictionary mapping **display names** (e.g., `"en-US-JennyNeural - Female"`) to their corresponding voice metadata, in the form:
|
| 12 |
+
|
| 13 |
+
```json
|
| 14 |
+
{
|
| 15 |
+
"Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)",
|
| 16 |
+
"ShortName": "zh-CN-XiaoxiaoNeural",
|
| 17 |
+
"Gender": "Female",
|
| 18 |
+
"Locale": "zh-CN",
|
| 19 |
+
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
|
| 20 |
+
"FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)",
|
| 21 |
+
"Status": "GA",
|
| 22 |
+
"VoiceTag": {
|
| 23 |
+
"ContentCategories": ["News", "Novel"],
|
| 24 |
+
"VoicePersonalities": ["Warm"]
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
- `text_to_speech` – Converts input text into speech audio using Microsoft Edge TTS.
|
| 33 |
+
|
| 34 |
+
- `text` (string, required): The text content to be synthesized into speech.
|
| 35 |
+
|
| 36 |
+
- `voice` (string, required): The selected voice in the format `"ShortName - Gender"`.
|
| 37 |
+
|
| 38 |
+
- `rate` (integer, optional): Speech rate adjustment as a percentage (e.g., `-20` for 20% slower, `20` for 20% faster). Default: `0`, i.e. `+0%`.
|
| 39 |
+
|
| 40 |
+
- `pitch` (integer, optional): Pitch adjustment in Hz. Default: `0` means `+0Hz`.
|
| 41 |
+
|
| 42 |
+
- **Returns** (string): URL to the generated MP3 file.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
### Prompts
|
| 47 |
+
|
| 48 |
+
- **list_voices**
|
| 49 |
+
- list all available voice display names
|
| 50 |
+
|
| 51 |
+
- **text_to_speech**
|
| 52 |
+
- TTS `<your text>` with a suitable voice
|
| 53 |
+
|
| 54 |
+
## Installation
|
| 55 |
+
|
| 56 |
+
1. ```pip install -U edge_tts "gradio[mcp]"```
|
| 57 |
+
2. Download `app.py` to a directory of your choice.
|
| 58 |
+
3. Run ```python app.py```; the console will then print output
|
| 59 |
+
like this
|
| 60 |
+
|
| 61 |
+
* Running on local URL: http://127.0.0.1:7860
|
| 62 |
+
* To create a public link, set `share=True` in `launch()`.
|
| 63 |
+
|
| 64 |
+
🔨 Launching MCP server:
|
| 65 |
+
* Streamable HTTP URL: http://127.0.0.1:7860/gradio_api/mcp/
|
| 66 |
+
* [Deprecated] SSE URL: http://127.0.0.1:7860/gradio_api/mcp/sse
|
| 67 |
+
|
| 68 |
+
## Configuration
|
| 69 |
+
|
| 70 |
+
You can find the exact config to copy-paste by going to the "View API" link in the footer of your Gradio app, and then clicking on "MCP".
|
| 71 |
+

|
| 72 |
+
For MCP clients that can connect to a URL-based server over Streamable HTTP or SSE (e.g. Cursor, Windsurf, Cline), simply add the following configuration to your MCP config. For detailed steps, please refer to [this guide](https://www.gradio.app/guides/using-docs-mcp#installing-in-the-clients).
|
| 73 |
+
```json
|
| 74 |
+
{
|
| 75 |
+
"mcpServers": {
|
| 76 |
+
"Edge TTS": {
|
| 77 |
+
"url": "http://127.0.0.1:7860/gradio_api/mcp/"
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
## License
|
| 85 |
+
|
| 86 |
+
This project is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.
|
app.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import edge_tts
|
| 3 |
+
import aiohttp
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import tempfile, time
|
| 7 |
+
from typing import TypedDict, List, Dict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class VoiceTag(TypedDict, total=False):
    """Descriptive tags attached to a voice entry.

    Declared with ``total=False``, so each key may be absent.
    """

    # Content categories listed for the voice, e.g. ["News", "Novel"].
    ContentCategories: List[str]
    # Personality labels listed for the voice, e.g. ["Warm"].
    VoicePersonalities: List[str]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class VoiceMetadata(TypedDict, total=False):
    """Metadata record describing a single Edge TTS voice.

    Declared with ``total=False``, so each key may be absent.
    """

    # Full voice name, e.g.
    # "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)".
    Name: str
    # Short identifier, e.g. "zh-CN-XiaoxiaoNeural".
    ShortName: str
    # Voice gender, e.g. "Female".
    Gender: str
    # Locale tag, e.g. "zh-CN".
    Locale: str
    # Suggested audio codec, e.g. "audio-24khz-48kbitrate-mono-mp3".
    SuggestedCodec: str
    # Human-readable name, e.g.
    # "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)".
    FriendlyName: str
    # Status string, e.g. "GA".
    Status: str
    # Nested tag record (content categories / personalities).
    VoiceTag: VoiceTag
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
async def list_voices() -> Dict[str, VoiceMetadata]:
    """
    Fetch available voices from Microsoft Edge TTS.

    The voices are ordered by a fixed language priority list first and by
    ``ShortName`` second; voices whose language is not in the list come last.

    Returns:
        Dict[str, VoiceMetadata]: A dictionary mapping display names
        (e.g. "en-US-JennyNeural - Female") to their corresponding voice
        metadata, in the form like
        {
            'Name': 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)',
            'ShortName': 'zh-CN-XiaoxiaoNeural',
            'Gender': 'Female',
            'Locale': 'zh-CN',
            'SuggestedCodec': 'audio-24khz-48kbitrate-mono-mp3',
            'FriendlyName': 'Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)',
            'Status': 'GA',
            'VoiceTag': {
                'ContentCategories': ['News', 'Novel'],
                'VoicePersonalities': ['Warm']
            }
        }

    Raises:
        Exception: Whatever ``edge_tts.list_voices()`` raised on the last of
            the 5 attempts.
    """

    priority_order = [
        "zh",     # Chinese first
        "en-US",  # American English
        "en-GB",  # British English
        "en",     # other English variants
        "hi",     # Hindi
        "es",     # Spanish
        "ar",     # Modern Standard Arabic
        "fr",     # French
        "bn",     # Bengali
        "pt",     # Portuguese
        "ru",     # Russian
        "id",     # Indonesian
        "ur",     # Urdu
        "de",     # German
        "ja",     # Japanese
        "pcm",    # Nigerian Pidgin
        "ar-EG",  # Egyptian Arabic
        "mr",     # Marathi
        "vi",     # Vietnamese
        "te",     # Telugu
        "ha",     # Hausa
        "tr",     # Turkish
        "pa",     # Western Punjabi
        "sw",     # Swahili
        "fil",    # Tagalog
        "ta",     # Tamil
        "yue",    # Cantonese
        "wuu",    # Wu Chinese
        "fa",     # Persian
        "ko",     # Korean
        "th",     # Thai
        "jv",     # Javanese
    ]

    lang_priority = {lang: i for i, lang in enumerate(priority_order)}

    # Try the most specific (longest) prefixes first. Matching in list order
    # let the generic "ar" shadow "ar-EG", so the latter could never match.
    prefixes_by_specificity = sorted(lang_priority, key=len, reverse=True)

    def get_priority(short_name: str) -> tuple[int, str]:
        """
        Return (priority_rank, short_name) for stable sorting.
        """
        for prefix in prefixes_by_specificity:
            # Match whole language-tag components only ("es" must not match
            # a hypothetical "est-...").
            if short_name == prefix or short_name.startswith(prefix + "-"):
                return (lang_priority[prefix], short_name)
        # Languages not in the list sort last, ordered by short_name.
        return (len(priority_order), short_name)

    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            voices = await edge_tts.list_voices()
            break
        except Exception as e:
            if attempt == max_attempts - 1:
                # Out of attempts: propagate without a pointless extra delay.
                raise
            print("Retrying due to handshake error:", e)
            # Linear back-off (0s, 1s, 2s, ...). Must be asyncio.sleep:
            # time.sleep would block the event loop this coroutine runs on.
            await asyncio.sleep(1 * attempt)

    # Sort by language priority first, then by short_name, so voices of the
    # same language stay in a deterministic order.
    voices_sorted = sorted(voices, key=lambda v: get_priority(v["ShortName"]))

    return {f"{v['ShortName']} - {v['Gender']}": v for v in voices_sorted}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
async def text_to_speech(text: str, voice: str, rate: int = 0, pitch: int = 0) -> str:
    """
    Convert input text to speech using Microsoft Edge TTS.

    Args:
        text (str): The text to synthesize into speech.
        voice (str): The selected voice in the format "ShortName - Gender".
        rate (int, optional): Speech rate adjustment percentage. Default is 0.
        pitch (int, optional): Pitch adjustment in Hz. Default is 0.

    Returns:
        str: Path to the generated MP3 file.

    Raises:
        aiohttp.client_exceptions.WSServerHandshakeError: If the handshake
            still fails on the last of the 5 attempts.
    """
    import os  # local import: only needed to clean up the temp file on failure

    voice_short_name = voice.split(" - ")[0]
    # Coerce to int so a float such as 10.0 does not break the "+d" format.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    # Reserve the output path and close the handle before writing to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    max_attempts = 5
    try:
        for attempt in range(max_attempts):
            try:
                # The retry must cover save(): constructing Communicate does
                # no network I/O, so a handshake error can only surface while
                # saving. A fresh instance is created for every attempt.
                communicate = edge_tts.Communicate(
                    text, voice_short_name, rate=rate_str, pitch=pitch_str
                )
                await communicate.save(tmp_path)
                break
            except aiohttp.client_exceptions.WSServerHandshakeError as e:
                if attempt == max_attempts - 1:
                    raise
                print("Retrying due to handshake error:", e)
                # Linear back-off (0s, 1s, 2s, ...) without blocking the
                # event loop, which time.sleep would do.
                await asyncio.sleep(1 * attempt)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    # Original author's note: when this path is output to gr.Audio, Gradio
    # turns it into a URL for programmatic clients.
    return tmp_path
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# UI strings for the English and Chinese locales; looked up by key through
# i18n("...") when the interface is built.
i18n = gr.I18n(
    en={
        "pageTitle": "# 🎙️ Microsoft Online Edge TTS & MCP Server",
        "generateSpeech": "Generate Speech",
        'description': """Convert text to speech using the free [Microsoft Edge TTS](https://github.com/rany2/edge-tts), API and MCP are available. Made by [MathJoy](https://www.cnblogs.com/imath).""",
        "selectVoice": "Select Voice",
        "speechRateAdjustment": "Speech Rate Adjustment (%)",
        "pitchAdjustment": "Pitch Adjustment (Hz)",
        "inputText": "Input Text",
    },
    zh={
        "pageTitle": "# 🎙️ 微软线上 Edge TTS & MCP",
        "generateSpeech": "生成语音",
        'description': """文本语音合成,基于免费的[Microsoft Edge TTS](https://github.com/rany2/edge-tts),由[MathJoy](https://www.cnblogs.com/imath)精心制作,也可通过API或MCP来使用。""",
        "selectVoice": "选择音色",
        "speechRateAdjustment": "语速调整 (%)",
        # Fixed: the label previously said "音量" (volume) although this
        # slider controls pitch.
        "pitchAdjustment": "音调调整 (Hz)",
        "inputText": "输入文字",
    },
)
|
| 158 |
+
|
| 159 |
+
async def create_UI():
    """
    Build the Gradio Blocks interface for the TTS app.

    Fetches the voice list once via ``list_voices`` to populate the voice
    dropdown, lays out the text box, voice/rate/pitch controls, the generate
    button and the audio player, and wires the button to ``text_to_speech``.

    Returns:
        The constructed ``gr.Blocks`` object (not launched yet).
    """
    # NOTE(review): the nesting of the ``with`` blocks below was reconstructed
    # from a listing that had lost its indentation - confirm the layout.
    voices = await list_voices()
    voiceList = list(voices.keys())

    with gr.Blocks(title="Microsoft Edge TTS & MCP", analytics_enabled=False) as UI:
        # Per the original author's note, only functions wired up as an
        # event ``fn`` are exposed as tools by default, so list_voices is
        # registered explicitly here.
        gr.api(
            list_voices
        )

        gr.Markdown(i18n("pageTitle"))

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(i18n("description"))

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label=i18n("inputText"), lines=5, value='Your text here')

                voice_dropdown = gr.Dropdown(
                    choices=voiceList,
                    label=i18n("selectVoice"),
                    # Guard against an empty voice list instead of raising
                    # IndexError on voiceList[0].
                    value=voiceList[0] if voiceList else None,
                )

                rate_slider = gr.Slider(
                    minimum=-50, maximum=50, value=0,
                    label=i18n("speechRateAdjustment"), step=1
                )

                pitch_slider = gr.Slider(
                    minimum=-20, maximum=20, value=0,
                    label=i18n("pitchAdjustment"), step=1
                )

                generate_btn = gr.Button(i18n("generateSpeech"), variant="primary")
                audio_output = gr.Audio(label="Generated Audio", type="filepath", autoplay=True)

                generate_btn.click(
                    fn=text_to_speech,
                    inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
                    outputs=[audio_output]
                )

    return UI
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
async def main():
    """Build the interface, enable its queue and launch it as an MCP server."""
    demo = await create_UI()
    # Turn on the queue; 50 is passed as the default concurrency limit.
    demo.queue(default_concurrency_limit=50)
    demo.launch(
        # show_api=False,
        i18n=i18n,
        mcp_server=True,  # make this an MCP server
    )


# Script entry point: drive the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
edge_tts
|
| 2 |
+
gradio>=5.43.1
|