yukee1992 committed on
Commit
f388252
·
verified ·
1 Parent(s): 477e22f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -59
app.py CHANGED
@@ -5,17 +5,27 @@ import tempfile
5
  import os
6
  import json
7
  from pathlib import Path
 
 
 
 
8
 
9
- # Chinese voice options with different characteristics
 
 
 
 
 
 
 
10
  VOICE_MAPPING = {
11
- 0: "zh-CN-XiaoxiaoNeural", # Loyal Sister - Gentle, warm
12
- 1: "zh-CN-XiaoyiNeural", # Sweet Voice - Lively, cute
13
- 2: "zh-CN-YunjianNeural", # Cool Voice - Deep, calm
14
- 3: "zh-CN-XiaomengNeural", # Loli Voice - Childish, energetic
15
- 4: "zh-CN-YunxiNeural", # Professional - Clear, broadcast
16
  }
17
 
18
- # Voice style descriptions
19
  VOICE_DESCRIPTIONS = {
20
  0: "Loyal Sister (Xiaoxiao) - Warm, caring",
21
  1: "Sweet Voice (Xiaoyi) - Lively, cute",
@@ -24,7 +34,6 @@ VOICE_DESCRIPTIONS = {
24
  4: "Professional (Yunxi) - Clear, broadcast"
25
  }
26
 
27
- # Emotion mapping through speech rate and pitch
28
  def get_emotion_params(emotion_id):
29
  """Convert emotion ID to speech parameters"""
30
  emotions = {
@@ -36,15 +45,99 @@ def get_emotion_params(emotion_id):
36
  }
37
  return emotions.get(emotion_id, emotions[0])
38
 
39
- async def generate_speech(text, voice_id, emotion_id, speed=1.0):
40
  """
41
- Generate speech using Edge TTS
42
 
43
  Args:
44
- text: Text to synthesize (Chinese or English)
45
- voice_id: 0-4 for different voice types
46
- emotion_id: 0-4 for different emotions
47
- speed: Speech rate multiplier
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """
49
  try:
50
  # Get voice
@@ -58,7 +151,7 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
58
  adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
59
  rate = f"{adjusted_rate:+d}%"
60
 
61
- # Create communicate object with parameters
62
  communicate = edge_tts.Communicate(
63
  text,
64
  voice,
@@ -69,15 +162,15 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
69
 
70
  # Generate audio to temporary file
71
  temp_dir = tempfile.mkdtemp()
72
- output_path = os.path.join(temp_dir, "output.mp3")
73
 
74
- await communicate.save(output_path)
75
 
76
- # Return audio file path and metadata
77
- return output_path, {
78
- "success": True,
79
- "voice": VOICE_DESCRIPTIONS[voice_id],
80
  "voice_id": voice_id,
 
81
  "emotion_id": emotion_id,
82
  "speed": speed,
83
  "parameters": {
@@ -87,6 +180,33 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
87
  }
88
  }
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
  return None, {
92
  "success": False,
@@ -103,16 +223,15 @@ def tts_wrapper(text, voice_id, emotion_id, speed):
103
  return audio_path, metadata
104
 
105
  # Create Gradio interface
106
- with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
107
  gr.Markdown("""
108
- # 🎙️ Chinese TTS API for n8n
109
- ### Stable Edge TTS backend with voice and emotion control
110
 
111
- | Parameter | Range | Description |
112
- |-----------|-------|-------------|
113
- | Voice ID | 0-4 | Different voice characteristics |
114
- | Emotion ID | 0-4 | Emotional expression |
115
- | Speed | 0.5-2.0 | Speech rate |
116
  """)
117
 
118
  with gr.Row():
@@ -143,46 +262,31 @@ with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
143
  label="Speed"
144
  )
145
 
146
- generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
147
 
148
  with gr.Column(scale=1):
149
  audio_output = gr.Audio(
150
- label="Generated Audio",
151
  type="filepath"
152
  )
153
  json_output = gr.JSON(
154
- label="Response Data (for n8n)"
155
  )
156
 
157
- # Voice reference table
158
- gr.Markdown("""
159
- ### Voice Reference
160
-
161
- | ID | Voice | Description |
162
- |----|-------|-------------|
163
- | 0 | Xiaoxiao | Loyal Sister - Warm, caring |
164
- | 1 | Xiaoyi | Sweet Voice - Lively, cute |
165
- | 2 | Yunjian | Cool Voice - Deep, calm |
166
- | 3 | Xiaomeng | Loli Voice - Childish |
167
- | 4 | Yunxi | Professional - Clear |
168
-
169
- ### Emotion Reference
170
-
171
- | ID | Emotion | Effect |
172
- |----|---------|--------|
173
- | 0 | Neutral | Normal speech |
174
- | 1 | Happy | Higher pitch, faster |
175
- | 2 | Sad | Lower pitch, slower |
176
- | 3 | Excited | High energy, fast |
177
- | 4 | Frustrated | Tense, emphasized |
178
  """)
179
 
180
- # Update previews when sliders change
181
  def update_voice_preview(voice_id):
182
  return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
183
 
184
  def update_emotion_preview(emotion_id):
185
- emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
186
  return f"**Selected:** {emotions[emotion_id]}"
187
 
188
  voice_slider.change(
@@ -204,9 +308,9 @@ with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
204
  outputs=[audio_output, json_output]
205
  )
206
 
207
- # For API mode (used by n8n)
208
  async def api_generate(params):
209
- """API endpoint for n8n"""
210
  text = params.get("text", "")
211
  voice_id = int(params.get("voice_id", 1))
212
  emotion_id = int(params.get("emotion_id", 0))
@@ -217,8 +321,11 @@ async def api_generate(params):
217
  if metadata["success"]:
218
  return {
219
  "status": "success",
220
- "audio_url": f"/file={audio_path}",
221
- "metadata": metadata
 
 
 
222
  }
223
  else:
224
  return {
 
5
  import os
6
  import json
7
  from pathlib import Path
8
+ from huggingface_hub import HfApi, upload_file
9
+ import uuid
10
+ from datetime import datetime
11
+ import shutil
12
 
13
+ # Configuration
14
+ HF_TOKEN = os.environ.get("HF_TOKEN") # You'll set this in Space secrets
15
+ DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/tts-audio-dataset") # Your dataset name
16
+
17
+ # Initialize Hugging Face API
18
+ hf_api = HfApi(token=HF_TOKEN)
19
+
20
+ # Chinese voice options
21
  VOICE_MAPPING = {
22
+ 0: "zh-CN-XiaoxiaoNeural", # Loyal Sister
23
+ 1: "zh-CN-XiaoyiNeural", # Sweet Voice
24
+ 2: "zh-CN-YunjianNeural", # Cool Voice
25
+ 3: "zh-CN-XiaomengNeural", # Loli Voice
26
+ 4: "zh-CN-YunxiNeural", # Professional
27
  }
28
 
 
29
  VOICE_DESCRIPTIONS = {
30
  0: "Loyal Sister (Xiaoxiao) - Warm, caring",
31
  1: "Sweet Voice (Xiaoyi) - Lively, cute",
 
34
  4: "Professional (Yunxi) - Clear, broadcast"
35
  }
36
 
 
37
  def get_emotion_params(emotion_id):
38
  """Convert emotion ID to speech parameters"""
39
  emotions = {
 
45
  }
46
  return emotions.get(emotion_id, emotions[0])
47
 
48
+ def upload_to_dataset(audio_path, metadata):
49
  """
50
+ Upload audio file to Hugging Face dataset and return URL
51
 
52
  Args:
53
+ audio_path: Local path to audio file
54
+ metadata: Dictionary with generation metadata
55
+
56
+ Returns:
57
+ dict: Upload result with file URL
58
+ """
59
+ try:
60
+ # Generate unique filename
61
+ file_id = str(uuid.uuid4())[:8]
62
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
63
+
64
+ # Create filename with metadata
65
+ voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
66
+ emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
67
+ emotion_name = emotion_names[metadata["emotion_id"]]
68
+
69
+ filename = f"tts_{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
70
+
71
+ # Path in dataset (organize by date)
72
+ date_path = datetime.now().strftime("%Y/%m/%d")
73
+ dataset_path = f"audio/{date_path}/{filename}"
74
+
75
+ # Upload file to dataset
76
+ upload_file(
77
+ path_or_fileobj=audio_path,
78
+ path_in_repo=dataset_path,
79
+ repo_id=DATASET_REPO,
80
+ repo_type="dataset",
81
+ token=HF_TOKEN
82
+ )
83
+
84
+ # Generate the raw file URL
85
+ file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
86
+
87
+ # Also create/update metadata JSON file
88
+ metadata_entry = {
89
+ "file_id": file_id,
90
+ "filename": filename,
91
+ "dataset_path": dataset_path,
92
+ "file_url": file_url,
93
+ "timestamp": timestamp,
94
+ "text": metadata["text"],
95
+ "voice_id": metadata["voice_id"],
96
+ "voice_name": voice_name,
97
+ "emotion_id": metadata["emotion_id"],
98
+ "emotion_name": emotion_name,
99
+ "speed": metadata["speed"],
100
+ "parameters": metadata["parameters"]
101
+ }
102
+
103
+ # Update metadata index (optional - stores all generations history)
104
+ metadata_filename = f"metadata/{date_path}/{file_id}.json"
105
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
106
+ json.dump(metadata_entry, f, indent=2)
107
+ temp_meta_path = f.name
108
+
109
+ # Upload metadata
110
+ upload_file(
111
+ path_or_fileobj=temp_meta_path,
112
+ path_in_repo=metadata_filename,
113
+ repo_id=DATASET_REPO,
114
+ repo_type="dataset",
115
+ token=HF_TOKEN
116
+ )
117
+
118
+ # Cleanup temp files
119
+ os.unlink(temp_meta_path)
120
+
121
+ return {
122
+ "success": True,
123
+ "file_url": file_url,
124
+ "dataset_path": dataset_path,
125
+ "filename": filename,
126
+ "metadata": metadata_entry
127
+ }
128
+
129
+ except Exception as e:
130
+ return {
131
+ "success": False,
132
+ "error": str(e)
133
+ }
134
+
135
+ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
136
+ """
137
+ Generate speech and save to dataset
138
+
139
+ Returns:
140
+ tuple: (local_audio_path, response_data)
141
  """
142
  try:
143
  # Get voice
 
151
  adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
152
  rate = f"{adjusted_rate:+d}%"
153
 
154
+ # Create communicate object
155
  communicate = edge_tts.Communicate(
156
  text,
157
  voice,
 
162
 
163
  # Generate audio to temporary file
164
  temp_dir = tempfile.mkdtemp()
165
+ local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
166
 
167
+ await communicate.save(local_audio_path)
168
 
169
+ # Prepare metadata for dataset
170
+ metadata = {
171
+ "text": text,
 
172
  "voice_id": voice_id,
173
+ "voice_description": VOICE_DESCRIPTIONS[voice_id],
174
  "emotion_id": emotion_id,
175
  "speed": speed,
176
  "parameters": {
 
180
  }
181
  }
182
 
183
+ # Upload to dataset
184
+ upload_result = upload_to_dataset(local_audio_path, metadata)
185
+
186
+ # Cleanup temp directory
187
+ shutil.rmtree(temp_dir)
188
+
189
+ if upload_result["success"]:
190
+ # Return both local file (for immediate playback) and dataset URL
191
+ return local_audio_path, {
192
+ "success": True,
193
+ "message": "Audio generated and saved to dataset",
194
+ "audio_url": upload_result["file_url"], # Permanent URL for n8n
195
+ "dataset_path": upload_result["dataset_path"],
196
+ "filename": upload_result["filename"],
197
+ "metadata": upload_result["metadata"],
198
+ "local_audio_available": True # For web interface playback
199
+ }
200
+ else:
201
+ # If upload fails, still return local audio but with warning
202
+ return local_audio_path, {
203
+ "success": True,
204
+ "message": "Audio generated but failed to save to dataset",
205
+ "warning": upload_result["error"],
206
+ "audio_url": None,
207
+ "local_audio_available": True
208
+ }
209
+
210
  except Exception as e:
211
  return None, {
212
  "success": False,
 
223
  return audio_path, metadata
224
 
225
  # Create Gradio interface
226
+ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Soft()) as demo:
227
  gr.Markdown("""
228
+ # 🎙️ Chinese TTS API with Hugging Face Dataset Storage
229
+ ### Generate speech and automatically save to dataset with permanent URL
230
 
231
+ ## 🔗 Dataset Integration
232
+ - Audio files are automatically saved to your Hugging Face dataset
233
+ - Returns permanent URL for use in n8n workflows
234
+ - Files organized by date in the dataset
 
235
  """)
236
 
237
  with gr.Row():
 
262
  label="Speed"
263
  )
264
 
265
+ generate_btn = gr.Button("🎵 Generate & Save to Dataset", variant="primary", size="lg")
266
 
267
  with gr.Column(scale=1):
268
  audio_output = gr.Audio(
269
+ label="Generated Audio (Local)",
270
  type="filepath"
271
  )
272
  json_output = gr.JSON(
273
+ label="Response Data (includes permanent dataset URL)"
274
  )
275
 
276
+ # Show dataset info
277
+ gr.Markdown(f"""
278
+ ### 📊 Dataset Info
279
+ - **Dataset:** `{DATASET_REPO}`
280
+ - Audio files saved to: `/audio/YYYY/MM/DD/`
281
+ - Metadata saved to: `/metadata/YYYY/MM/DD/`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  """)
283
 
284
+ # Update previews
285
  def update_voice_preview(voice_id):
286
  return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
287
 
288
  def update_emotion_preview(emotion_id):
289
+ emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
290
  return f"**Selected:** {emotions[emotion_id]}"
291
 
292
  voice_slider.change(
 
308
  outputs=[audio_output, json_output]
309
  )
310
 
311
+ # API endpoint for n8n
312
  async def api_generate(params):
313
+ """API endpoint for n8n - returns permanent dataset URL"""
314
  text = params.get("text", "")
315
  voice_id = int(params.get("voice_id", 1))
316
  emotion_id = int(params.get("emotion_id", 0))
 
321
  if metadata["success"]:
322
  return {
323
  "status": "success",
324
+ "audio_url": metadata.get("audio_url"), # Permanent dataset URL
325
+ "dataset_path": metadata.get("dataset_path"),
326
+ "filename": metadata.get("filename"),
327
+ "metadata": metadata.get("metadata"),
328
+ "message": metadata.get("message", "Audio generated successfully")
329
  }
330
  else:
331
  return {