yukee1992 committed on
Commit
be33088
·
verified ·
1 Parent(s): 77fe1f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -72
app.py CHANGED
@@ -1,108 +1,234 @@
1
  import gradio as gr
2
- import ChatTTS
3
- import torch
4
- import torchaudio
5
- import numpy as np
6
  import os
 
7
  from pathlib import Path
8
 
9
- # Initialize model
10
# Build the shared ChatTTS engine once at import time; the request handlers
# below reuse this single instance.
chat = ChatTTS.Chat()
# compile=False is the current setting; switch to True for better performance.
chat.load(compile=False)
12
-
13
- # Voice profile mapping (0-4)
14
# Numeric voice ID (0-4) -> internal profile name.
VOICE_PROFILES = {
    # Warm, caring tone
    0: "loyal_sister",
    # Gentle, melodic
    1: "sweet_voice",
    # Calm, composed
    2: "cool_voice",
    # High-pitched, youthful
    3: "loli_voice",
    # Neutral, clear
    4: "professional",
}

# Numeric emotion ID (0-4) -> ChatTTS refine-text control prompt.
EMOTION_CONTROLS = {
    # Neutral
    0: "[oral_0][laugh_0][break_0]",
    # Happy
    1: "[oral_6][laugh_2][break_4]",
    # Sad
    2: "[oral_2][laugh_0][break_6]",
    # Excited
    3: "[oral_8][laugh_1][break_2]",
    # Frustrated
    4: "[oral_7][laugh_0][break_5]",
}
30
 
31
def generate_speaker(voice_id):
    """Sample a speaker embedding after seeding torch with a per-voice seed.

    Args:
        voice_id: Voice ID; IDs without an entry in the seed table use 42.

    Returns:
        Whatever ``chat.sample_random_speaker()`` returns.
    """
    seeds = {0: 42, 1: 123, 2: 256, 3: 389, 4: 512}
    seed = seeds[voice_id] if voice_id in seeds else 42
    # Seeding the global torch RNG right before sampling is presumably what
    # keeps a given voice ID stable across requests.
    torch.manual_seed(seed)
    speaker = chat.sample_random_speaker()
    return speaker
 
 
 
 
 
 
36
 
37
def tts_generate(text, voice_id, emotion_id, speed=1.0):
    """Generate speech with a controlled voice and emotion.

    Args:
        text: Text to synthesize.
        voice_id: Voice ID used to pick the speaker embedding and the
            profile name reported in the metadata.
        emotion_id: Key into EMOTION_CONTROLS; unknown IDs use the neutral
            prompt.
        speed: Multiplier applied to the sampling temperature.

    Returns:
        ``(path, metadata)`` on success, where ``path`` is the generated WAV
        file, or ``(None, {"success": False, "error": ...})`` on any failure.
    """
    import tempfile

    try:
        # Get speaker embedding
        spk_emb = generate_speaker(voice_id)

        # Configure inference parameters
        # NOTE(review): `speed` only scales the sampling temperature here; it
        # is not passed to the model as a speech-rate control. Confirm intent.
        params_infer_code = ChatTTS.Chat.InferCodeParams(
            spk_emb=spk_emb,
            temperature=0.3 * speed,
            top_P=0.7,
            top_K=20,
        )

        # Configure emotion
        params_refine_text = ChatTTS.Chat.RefineTextParams(
            prompt=EMOTION_CONTROLS.get(emotion_id, "[oral_0][laugh_0][break_0]"),
        )

        # Generate speech
        wavs = chat.infer(
            [text],
            params_refine_text=params_refine_text,
            params_infer_code=params_infer_code,
        )

        # Save audio to a unique file per request. A fixed "output.wav" would
        # be overwritten by concurrent requests.
        audio_tensor = torch.from_numpy(wavs[0]).unsqueeze(0)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        torchaudio.save(output_path, audio_tensor, 24000)

        return output_path, {
            "success": True,
            # generate_speaker() falls back to voice 0's seed for unknown IDs,
            # so report voice 0 rather than raising KeyError after synthesis.
            "voice": VOICE_PROFILES.get(voice_id, VOICE_PROFILES[0]),
            "emotion": emotion_id,
            "speed": speed,
        }
    except Exception as e:
        # UI boundary: report the failure to the caller instead of raising.
        return None, {"success": False, "error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # Create Gradio interface
76
with gr.Blocks(title="Chinese TTS API", theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("""
    # 🎙️ Chinese TTS API for n8n
    Control voice and emotion via numeric parameters
    """)

    with gr.Row():
        # Left column: request inputs
        with gr.Column():
            text_input = gr.Textbox(
                label="Text (支持中文)",
                placeholder="输入文本...",
                lines=3,
            )
            voice_id = gr.Slider(
                minimum=0,
                maximum=4,
                step=1,
                value=1,
                label="Voice ID (0: Sister, 1: Sweet, 2: Cool, 3: Loli, 4: Professional)",
            )
            emotion_id = gr.Slider(
                minimum=0,
                maximum=4,
                step=1,
                value=0,
                label="Emotion ID (0: Neutral, 1: Happy, 2: Sad, 3: Excited, 4: Frustrated)",
            )
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                step=0.1,
                value=1.0,
                label="Speed",
            )
            generate_btn = gr.Button("Generate", variant="primary")

        # Right column: synthesized audio and the JSON metadata
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            json_output = gr.JSON(label="Response")

    # Wire the button to the synthesis function.
    request_inputs = [text_input, voice_id, emotion_id, speed]
    response_outputs = [audio_output, json_output]
    generate_btn.click(
        fn=tts_generate,
        inputs=request_inputs,
        outputs=response_outputs,
    )

if __name__ == "__main__":
    # Queue up to 50 pending requests and listen on all interfaces, port 7860.
    queued_app = demo.queue(max_size=50)
    queued_app.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
1
  import gradio as gr
2
+ import asyncio
3
+ import edge_tts
4
+ import tempfile
 
5
  import os
6
+ import json
7
  from pathlib import Path
8
 
9
+ # Chinese voice options with different characteristics
10
# Numeric voice ID (0-4) -> Edge TTS voice name.
VOICE_MAPPING = {
    # Loyal Sister - gentle, warm
    0: "zh-CN-XiaoxiaoNeural",
    # Sweet Voice - lively, cute
    1: "zh-CN-XiaoyiNeural",
    # Cool Voice - deep, calm
    2: "zh-CN-YunjianNeural",
    # Loli Voice - childish, energetic
    # NOTE(review): confirm this voice is actually offered by the Edge TTS
    # service (check `edge-tts --list-voices`).
    3: "zh-CN-XiaomengNeural",
    # Professional - clear, broadcast
    4: "zh-CN-YunxiNeural",
}

# Numeric voice ID (0-4) -> human-readable label shown in the UI and metadata.
VOICE_DESCRIPTIONS = {
    0: "Loyal Sister (Xiaoxiao) - Warm, caring",
    1: "Sweet Voice (Xiaoyi) - Lively, cute",
    2: "Cool Voice (Yunjian) - Deep, calm",
    3: "Loli Voice (Xiaomeng) - Childish, energetic",
    4: "Professional (Yunxi) - Clear, broadcast",
}
26
 
27
+ # Emotion mapping through speech rate and pitch
28
def get_emotion_params(emotion_id):
    """Return the rate/pitch/volume adjustments for an emotion ID.

    Unknown IDs get the neutral preset (all adjustments zero).
    """
    neutral = {"rate": "+0%", "pitch": "+0Hz", "volume": "+0%"}
    presets = {
        0: neutral,
        # Happy
        1: dict(rate="+15%", pitch="+30Hz", volume="+10%"),
        # Sad
        2: dict(rate="-10%", pitch="-20Hz", volume="-10%"),
        # Excited
        3: dict(rate="+25%", pitch="+50Hz", volume="+15%"),
        # Frustrated
        4: dict(rate="+5%", pitch="+15Hz", volume="+5%"),
    }
    if emotion_id in presets:
        return presets[emotion_id]
    return neutral
38
 
39
async def generate_speech(text, voice_id, emotion_id, speed=1.0):
    """Generate speech using Edge TTS.

    Args:
        text: Text to synthesize (Chinese or English). Must not be blank.
        voice_id: 0-4 for different voice types; unknown IDs fall back to
            voice 0 (Xiaoxiao).
        emotion_id: 0-4 for different emotions; unknown IDs are neutral.
        speed: Speech rate multiplier. Each 0.1 above or below 1.0 shifts the
            rate by 5 percentage points on top of the emotion's own rate.

    Returns:
        ``(path, metadata)`` on success, where ``path`` is the generated MP3,
        or ``(None, {"success": False, "error": ...})`` on any failure.
    """
    import shutil

    temp_dir = None
    try:
        # Reject blank input up front instead of making a doomed network call.
        if not text or not text.strip():
            raise ValueError("text must not be empty")

        # Get voice (unknown IDs fall back to Xiaoxiao, i.e. voice 0)
        voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")

        # Get emotion parameters
        emotion_params = get_emotion_params(emotion_id)

        # Adjust rate based on speed. round() rather than int(): float error
        # makes (0.9 - 1.0) * 50 equal -4.999..., which int() truncates to -4.
        base_rate = int(emotion_params["rate"].rstrip("%"))
        adjusted_rate = base_rate + round((speed - 1.0) * 50)
        rate = f"{adjusted_rate:+d}%"

        # Create communicate object with parameters
        communicate = edge_tts.Communicate(
            text,
            voice,
            rate=rate,
            pitch=emotion_params["pitch"],
            volume=emotion_params["volume"],
        )

        # Generate audio to a per-request temporary directory
        temp_dir = tempfile.mkdtemp()
        output_path = os.path.join(temp_dir, "output.mp3")
        await communicate.save(output_path)

        # Return audio file path and metadata
        return output_path, {
            "success": True,
            # Same fallback as the voice lookup above, so an unknown ID cannot
            # raise KeyError after the audio has already been generated.
            "voice": VOICE_DESCRIPTIONS.get(voice_id, VOICE_DESCRIPTIONS[0]),
            "voice_id": voice_id,
            "emotion_id": emotion_id,
            "speed": speed,
            "parameters": {
                "rate": rate,
                "pitch": emotion_params["pitch"],
                "volume": emotion_params["volume"],
            },
        }

    except Exception as e:
        # UI/API boundary: report the failure instead of raising, and do not
        # leave an orphaned temp directory behind.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, {
            "success": False,
            "error": str(e),
        }
95
+
96
def tts_wrapper(text, voice_id, emotion_id, speed):
    """Synchronous entry point that runs ``generate_speech`` to completion.

    Args:
        text: Text to synthesize.
        voice_id: Voice ID forwarded to ``generate_speech``.
        emotion_id: Emotion ID forwarded to ``generate_speech``.
        speed: Speech rate multiplier forwarded to ``generate_speech``.

    Returns:
        The ``(audio_path, metadata)`` tuple produced by ``generate_speech``.
    """
    # asyncio.run() creates a fresh event loop and, unlike a hand-made
    # new_event_loop(), closes it afterwards, so loops do not pile up.
    audio_path, metadata = asyncio.run(
        generate_speech(text, voice_id, emotion_id, speed)
    )
    return audio_path, metadata
104
 
105
  # Create Gradio interface
106
with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:

    def update_voice_preview(voice_id):
        """Return the preview markdown for a voice slider value.

        The value is coerced to int; unknown or invalid IDs show voice 0,
        matching the fallback used when synthesizing.
        """
        try:
            description = VOICE_DESCRIPTIONS[int(voice_id)]
        except (KeyError, TypeError, ValueError):
            description = VOICE_DESCRIPTIONS[0]
        return f"**Selected:** {description}"

    def update_emotion_preview(emotion_id):
        """Return the preview markdown for an emotion slider value.

        The value is coerced to int (a float cannot index a list); unknown or
        invalid IDs show Neutral, matching get_emotion_params' fallback.
        """
        emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
        try:
            index = int(emotion_id)
        except (TypeError, ValueError):
            index = 0
        if not 0 <= index < len(emotions):
            index = 0
        return f"**Selected:** {emotions[index]}"

    gr.Markdown("""
    # 🎙️ Chinese TTS API for n8n
    ### Stable Edge TTS backend with voice and emotion control

    | Parameter | Range | Description |
    |-----------|-------|-------------|
    | Voice ID | 0-4 | Different voice characteristics |
    | Emotion ID | 0-4 | Emotional expression |
    | Speed | 0.5-2.0 | Speech rate |
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="📝 Text (支持中文/English)",
                placeholder="输入要转换的...",
                lines=4,
                value="你好,欢迎使用语音合成服务。"
            )

            with gr.Row():
                voice_slider = gr.Slider(
                    minimum=0, maximum=4, step=1, value=1,
                    label="Voice ID (0-4)"
                )
                # Rendered by the change handler's own function, so the initial
                # text matches what the slider shows after it is moved.
                voice_preview = gr.Markdown(update_voice_preview(1))

            with gr.Row():
                emotion_slider = gr.Slider(
                    minimum=0, maximum=4, step=1, value=0,
                    label="Emotion ID (0-4)"
                )
                emotion_preview = gr.Markdown(update_emotion_preview(0))

            speed_slider = gr.Slider(
                minimum=0.5, maximum=2.0, step=0.1, value=1.0,
                label="Speed"
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )
            json_output = gr.JSON(
                label="Response Data (for n8n)"
            )

    # Voice reference table
    gr.Markdown("""
    ### Voice Reference

    | ID | Voice | Description |
    |----|-------|-------------|
    | 0 | Xiaoxiao | Loyal Sister - Warm, caring |
    | 1 | Xiaoyi | Sweet Voice - Lively, cute |
    | 2 | Yunjian | Cool Voice - Deep, calm |
    | 3 | Xiaomeng | Loli Voice - Childish |
    | 4 | Yunxi | Professional - Clear |

    ### Emotion Reference

    | ID | Emotion | Effect |
    |----|---------|--------|
    | 0 | Neutral | Normal speech |
    | 1 | Happy | Higher pitch, faster |
    | 2 | Sad | Lower pitch, slower |
    | 3 | Excited | High energy, fast |
    | 4 | Frustrated | Tense, emphasized |
    """)

    # Update previews when sliders change
    voice_slider.change(
        fn=update_voice_preview,
        inputs=voice_slider,
        outputs=voice_preview
    )

    emotion_slider.change(
        fn=update_emotion_preview,
        inputs=emotion_slider,
        outputs=emotion_preview
    )

    # Generate button click
    generate_btn.click(
        fn=tts_wrapper,
        inputs=[text_input, voice_slider, emotion_slider, speed_slider],
        outputs=[audio_output, json_output]
    )
206
 
207
+ # For API mode (used by n8n)
208
async def api_generate(params):
    """Run a synthesis request described by a parameter mapping.

    NOTE(review): nothing in the visible code registers this function as an
    endpoint; confirm how n8n is expected to reach it.

    Args:
        params: Mapping with optional keys ``text`` (default ""),
            ``voice_id`` (default 1), ``emotion_id`` (default 0) and
            ``speed`` (default 1.0). Numeric values may be numbers or numeric
            strings; IDs such as "1.0" are accepted and truncated to int.

    Returns:
        ``{"status": "success", "audio_url": ..., "metadata": ...}`` on
        success, otherwise ``{"status": "error", "error": ...}``. Malformed
        parameters are reported through the error form rather than raised.
    """
    try:
        text = params.get("text", "")
        # Go through float() so JSON-style values like 1.0 or "1.0" parse.
        voice_id = int(float(params.get("voice_id", 1)))
        emotion_id = int(float(params.get("emotion_id", 0)))
        speed = float(params.get("speed", 1.0))
    except (AttributeError, OverflowError, TypeError, ValueError) as exc:
        return {
            "status": "error",
            "error": f"invalid parameters: {exc}"
        }

    audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed)

    if metadata["success"]:
        return {
            "status": "success",
            "audio_url": f"/file={audio_path}",
            "metadata": metadata
        }
    else:
        return {
            "status": "error",
            "error": metadata["error"]
        }
228
+
229
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, and surface errors in the UI; the
    # request queue holds at most 50 pending jobs.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.queue(max_size=50).launch(**launch_options)