yukee1992 committed on
Commit
3640d59
·
verified ·
1 Parent(s): 480a42a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -197
app.py CHANGED
@@ -10,10 +10,19 @@ import uuid
10
  from datetime import datetime
11
  import shutil
12
  import re
 
 
 
13
 
14
  # Configuration
15
- HF_TOKEN = os.environ.get("HF_TOKEN") # Set in Space secrets
16
- DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/video-media-dataset") # Your dataset name
 
 
 
 
 
 
17
 
18
  # Initialize Hugging Face API
19
  hf_api = HfApi(token=HF_TOKEN)
@@ -35,9 +44,20 @@ VOICE_DESCRIPTIONS = {
35
  4: "Professional (Yunxi) - Clear, broadcast"
36
  }
37
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def sanitize_folder_name(title):
39
  """Convert video title to safe folder name"""
40
- # Remove special characters and replace spaces with underscores
41
  safe_name = re.sub(r'[^\w\s-]', '', title)
42
  safe_name = re.sub(r'[-\s]+', '_', safe_name)
43
  return safe_name.strip('_')
@@ -53,34 +73,22 @@ def get_emotion_params(emotion_id):
53
  }
54
  return emotions.get(emotion_id, emotions[0])
55
 
56
- def upload_to_dataset(audio_path, metadata, video_title):
57
  """
58
  Upload audio file to Hugging Face dataset under video title folder
59
-
60
- Args:
61
- audio_path: Local path to audio file
62
- metadata: Dictionary with generation metadata
63
- video_title: Title of the video (used as folder name)
64
-
65
- Returns:
66
- dict: Upload result with file URL
67
  """
68
  try:
69
- # Create safe folder name from video title
70
- folder_name = sanitize_folder_name(video_title)
71
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
72
  file_id = str(uuid.uuid4())[:8]
73
 
74
- # Get voice and emotion info
75
  voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
76
  emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
77
  emotion_name = emotion_names[metadata["emotion_id"]]
78
 
79
- # Create filename: [timestamp]_[voice]_[emotion]_[fileid].mp3
80
  filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
81
-
82
- # Path in dataset: /[video_title]/audio/[filename]
83
- dataset_path = f"{folder_name}/audio/{filename}"
84
 
85
  # Upload audio file to dataset
86
  upload_file(
@@ -91,10 +99,8 @@ def upload_to_dataset(audio_path, metadata, video_title):
91
  token=HF_TOKEN
92
  )
93
 
94
- # Generate the raw file URL
95
  file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
96
 
97
- # Create metadata entry
98
  metadata_entry = {
99
  "file_id": file_id,
100
  "type": "audio",
@@ -102,7 +108,7 @@ def upload_to_dataset(audio_path, metadata, video_title):
102
  "dataset_path": dataset_path,
103
  "file_url": file_url,
104
  "video_title": video_title,
105
- "video_folder": folder_name,
106
  "timestamp": timestamp,
107
  "text": metadata["text"],
108
  "voice_id": metadata["voice_id"],
@@ -113,26 +119,12 @@ def upload_to_dataset(audio_path, metadata, video_title):
113
  "parameters": metadata["parameters"]
114
  }
115
 
116
- # Update or create video metadata file (stores all assets for this video)
117
- video_metadata_path = f"{folder_name}/metadata.json"
118
-
119
- # Try to download existing metadata if it exists
120
- existing_metadata = []
121
- try:
122
- # This is a simplified approach - in production you'd want to properly manage metadata
123
- pass
124
- except:
125
- existing_metadata = []
126
-
127
- # For now, we'll create a separate metadata file for each audio
128
- # You can enhance this to maintain a single metadata file per video
129
- audio_metadata_path = f"{folder_name}/metadata/audio_{file_id}.json"
130
-
131
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
132
  json.dump(metadata_entry, f, indent=2)
133
  temp_meta_path = f.name
134
 
135
- # Upload audio metadata
136
  upload_file(
137
  path_or_fileobj=temp_meta_path,
138
  path_in_repo=audio_metadata_path,
@@ -141,7 +133,6 @@ def upload_to_dataset(audio_path, metadata, video_title):
141
  token=HF_TOKEN
142
  )
143
 
144
- # Cleanup temp files
145
  os.unlink(temp_meta_path)
146
 
147
  return {
@@ -149,7 +140,7 @@ def upload_to_dataset(audio_path, metadata, video_title):
149
  "file_url": file_url,
150
  "dataset_path": dataset_path,
151
  "filename": filename,
152
- "video_folder": folder_name,
153
  "metadata": metadata_entry
154
  }
155
 
@@ -159,26 +150,18 @@ def upload_to_dataset(audio_path, metadata, video_title):
159
  "error": str(e)
160
  }
161
 
162
- async def generate_speech(text, voice_id, emotion_id, speed, video_title):
163
  """
164
- Generate speech and save to dataset under video title folder
165
-
166
- Returns:
167
- tuple: (local_audio_path, response_data)
168
  """
169
  try:
170
- # Get voice
171
  voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")
172
-
173
- # Get emotion parameters
174
  emotion_params = get_emotion_params(emotion_id)
175
 
176
- # Adjust rate based on speed
177
  rate_percentage = int(emotion_params["rate"].replace("%", "").replace("+", ""))
178
  adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
179
  rate = f"{adjusted_rate:+d}%"
180
 
181
- # Create communicate object
182
  communicate = edge_tts.Communicate(
183
  text,
184
  voice,
@@ -187,13 +170,11 @@ async def generate_speech(text, voice_id, emotion_id, speed, video_title):
187
  volume=emotion_params["volume"]
188
  )
189
 
190
- # Generate audio to temporary file
191
  temp_dir = tempfile.mkdtemp()
192
  local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
193
 
194
  await communicate.save(local_audio_path)
195
 
196
- # Prepare metadata for dataset
197
  metadata = {
198
  "text": text,
199
  "voice_id": voice_id,
@@ -207,187 +188,112 @@ async def generate_speech(text, voice_id, emotion_id, speed, video_title):
207
  }
208
  }
209
 
210
- # Upload to dataset under video title folder
211
- upload_result = upload_to_dataset(local_audio_path, metadata, video_title)
212
-
213
- # Cleanup temp directory
214
  shutil.rmtree(temp_dir)
215
 
216
  if upload_result["success"]:
217
- return local_audio_path, {
218
  "success": True,
219
- "message": f"Audio generated and saved to dataset under folder: {video_title}",
220
  "video_title": video_title,
221
- "video_folder": upload_result["video_folder"],
222
  "audio_url": upload_result["file_url"],
223
  "dataset_path": upload_result["dataset_path"],
224
  "filename": upload_result["filename"],
225
- "metadata": upload_result["metadata"],
226
- "local_audio_available": True
227
  }
228
  else:
229
- return local_audio_path, {
230
- "success": True,
231
- "message": "Audio generated but failed to save to dataset",
232
- "warning": upload_result["error"],
233
- "audio_url": None,
234
- "local_audio_available": True
235
  }
236
 
237
  except Exception as e:
238
- return None, {
239
  "success": False,
240
  "error": str(e)
241
  }
242
 
243
- def tts_wrapper(text, voice_id, emotion_id, speed, video_title):
244
- """Wrapper function to handle async"""
245
- loop = asyncio.new_event_loop()
246
- asyncio.set_event_loop(loop)
247
- audio_path, metadata = loop.run_until_complete(
248
- generate_speech(text, voice_id, emotion_id, speed, video_title)
249
- )
250
- return audio_path, metadata
251
 
252
- # Create Gradio interface
253
- with gr.Blocks(title="TTS with Dataset Storage by Video Title", theme=gr.themes.Soft()) as demo:
254
- gr.Markdown("""
255
- # 🎙️ TTS API with Hugging Face Dataset Storage
256
- ### Audio files organized by video title folders
257
-
258
- ## 📁 Dataset Structure
259
- ```
260
- your-dataset/
261
- ├── [Video_Title_1]/
262
- │ ├── audio/
263
- │ │ ├── 20240115_143022_Xiaoyi_happy_a1b2.mp3
264
- │ │ └── 20240115_143145_Xiaoxiao_neutral_e5f6.mp3
265
- │ └── metadata/
266
- │ ├── audio_a1b2.json
267
- │ └── audio_e5f6.json
268
- ├── [Video_Title_2]/
269
- │ ├── audio/
270
- │ │ └── 20240115_144512_Yunjian_excited_g7h8.mp3
271
- │ └── metadata/
272
- │ └── audio_g7h8.json
273
- └── images/ (for future image storage)
274
- └── [Video_Title]/
275
- └── thumbnail.jpg
276
- ```
277
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
  video_title_input = gr.Textbox(
282
- label="🎬 Video Title (used as folder name)",
283
  placeholder="Enter video title...",
284
- value="My Awesome Video",
285
- info="This will create a folder with this name in the dataset"
 
 
 
286
  )
287
-
288
  text_input = gr.Textbox(
289
  label="📝 Text to synthesize",
290
  placeholder="输入中文或English...",
291
  lines=3,
292
  value="你好,欢迎使用语音合成服务。"
293
  )
294
-
295
- with gr.Row():
296
- voice_slider = gr.Slider(
297
- minimum=0, maximum=4, step=1, value=1,
298
- label="Voice ID (0-4)"
299
- )
300
- voice_preview = gr.Markdown("**Selected:** Sweet Voice (Xiaoyi)")
301
-
302
- with gr.Row():
303
- emotion_slider = gr.Slider(
304
- minimum=0, maximum=4, step=1, value=0,
305
- label="Emotion ID (0-4)"
306
- )
307
- emotion_preview = gr.Markdown("**Selected:** Neutral")
308
-
309
- speed_slider = gr.Slider(
310
- minimum=0.5, maximum=2.0, step=0.1, value=1.0,
311
- label="Speed"
312
- )
313
-
314
- generate_btn = gr.Button("🎵 Generate & Save to Video Folder", variant="primary", size="lg")
315
 
316
  with gr.Column(scale=1):
317
- audio_output = gr.Audio(
318
- label="Generated Audio",
319
- type="filepath"
320
- )
321
- json_output = gr.JSON(
322
- label="Response Data (includes dataset URL)"
323
- )
324
-
325
- # Show dataset structure preview
326
- gr.Markdown(f"""
327
- ### 📊 Dataset Info
328
- - **Dataset:** `{DATASET_REPO}`
329
- - **Structure:** `/[Video Title]/audio/[file].mp3`
330
- - **Metadata:** `/[Video Title]/metadata/[file_id].json`
331
- """)
332
-
333
- # Update previews
334
- def update_voice_preview(voice_id):
335
- return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
336
-
337
- def update_emotion_preview(emotion_id):
338
- emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
339
- return f"**Selected:** {emotions[emotion_id]}"
340
 
341
- voice_slider.change(
342
- fn=update_voice_preview,
343
- inputs=voice_slider,
344
- outputs=voice_preview
345
- )
346
-
347
- emotion_slider.change(
348
- fn=update_emotion_preview,
349
- inputs=emotion_slider,
350
- outputs=emotion_preview
351
- )
352
-
353
- # Generate button click
354
  generate_btn.click(
355
- fn=tts_wrapper,
356
- inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input],
357
  outputs=[audio_output, json_output]
358
  )
359
 
360
- # API endpoint for n8n
361
- async def api_generate(params):
362
- """API endpoint for n8n - returns permanent dataset URL"""
363
- text = params.get("text", "")
364
- voice_id = int(params.get("voice_id", 1))
365
- emotion_id = int(params.get("emotion_id", 0))
366
- speed = float(params.get("speed", 1.0))
367
- video_title = params.get("video_title", "Untitled Video")
368
-
369
- audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed, video_title)
370
-
371
- if metadata["success"]:
372
- return {
373
- "status": "success",
374
- "video_title": metadata.get("video_title"),
375
- "video_folder": metadata.get("video_folder"),
376
- "audio_url": metadata.get("audio_url"),
377
- "dataset_path": metadata.get("dataset_path"),
378
- "filename": metadata.get("filename"),
379
- "metadata": metadata.get("metadata"),
380
- "message": metadata.get("message")
381
- }
382
- else:
383
- return {
384
- "status": "error",
385
- "error": metadata["error"]
386
- }
387
 
388
  if __name__ == "__main__":
389
- demo.queue(max_size=50).launch(
390
- server_name="0.0.0.0",
391
- server_port=7860,
392
- show_error=True
393
- )
 
10
  from datetime import datetime
11
  import shutil
12
  import re
13
+ from fastapi import FastAPI, HTTPException
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+ import uvicorn
16
 
17
# Configuration (both values come from the environment / Space secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
# Defaults to the dataset that also stores the project images.
DATASET_REPO = os.getenv("DATASET_REPO", "yukee1992/video-project-images")

# Startup banner: shows which dataset is targeted and whether a token was
# found, without ever printing the token itself.
token_status = '✅ Set' if HF_TOKEN else '❌ Missing'
for banner_line in (
    "=" * 60,
    "🚀 STARTING TTS SERVICE WITH API",
    "=" * 60,
    f"📦 HF Dataset: {DATASET_REPO}",
    f"🔑 HF Token: {token_status}",
):
    print(banner_line)

# Initialize Hugging Face API
hf_api = HfApi(token=HF_TOKEN)
 
44
  4: "Professional (Yunxi) - Clear, broadcast"
45
  }
46
 
47
# Create FastAPI app
fastapi_app = FastAPI(title="TTS API")

# Add CORS middleware.
# Any origin may call the API, so credentials must stay disabled: a wildcard
# origin combined with allow_credentials=True is rejected by the CORS spec
# and would let arbitrary sites make credentialed requests.
# NOTE(review): the endpoints visible in this file read only the JSON body
# (no cookies/auth headers) - confirm before re-enabling credentials, and
# list explicit origins if you do.
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
58
+
59
def sanitize_folder_name(title, default="untitled"):
    """Convert a video title into a name that is safe to use as a folder.

    Characters other than word characters, whitespace and hyphens are
    dropped, every run of hyphens/whitespace becomes a single underscore,
    and leading/trailing underscores are stripped.

    Args:
        title: Video title. ``None`` and other falsy values are treated as
            an empty title instead of raising.
        default: Name returned when nothing usable is left after cleaning,
            so callers never build a path with an empty folder segment.

    Returns:
        The sanitized folder name, or ``default`` if it would be empty.
    """
    cleaned = re.sub(r'[^\w\s-]', '', str(title or ''))
    cleaned = re.sub(r'[-\s]+', '_', cleaned).strip('_')
    return cleaned or default
 
73
  }
74
  return emotions.get(emotion_id, emotions[0])
75
 
76
+ def upload_to_dataset(audio_path, metadata, video_title, project_id=None):
77
  """
78
  Upload audio file to Hugging Face dataset under video title folder
 
 
 
 
 
 
 
 
79
  """
80
  try:
81
+ # Use project_id if provided, otherwise use video_title
82
+ folder_name = project_id if project_id else sanitize_folder_name(video_title)
83
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
84
  file_id = str(uuid.uuid4())[:8]
85
 
 
86
  voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
87
  emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
88
  emotion_name = emotion_names[metadata["emotion_id"]]
89
 
 
90
  filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
91
+ dataset_path = f"data/projects/{folder_name}/audio/{filename}"
 
 
92
 
93
  # Upload audio file to dataset
94
  upload_file(
 
99
  token=HF_TOKEN
100
  )
101
 
 
102
  file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
103
 
 
104
  metadata_entry = {
105
  "file_id": file_id,
106
  "type": "audio",
 
108
  "dataset_path": dataset_path,
109
  "file_url": file_url,
110
  "video_title": video_title,
111
+ "project_id": folder_name,
112
  "timestamp": timestamp,
113
  "text": metadata["text"],
114
  "voice_id": metadata["voice_id"],
 
119
  "parameters": metadata["parameters"]
120
  }
121
 
122
+ # Upload metadata
123
+ audio_metadata_path = f"data/projects/{folder_name}/metadata/audio_{file_id}.json"
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
125
  json.dump(metadata_entry, f, indent=2)
126
  temp_meta_path = f.name
127
 
 
128
  upload_file(
129
  path_or_fileobj=temp_meta_path,
130
  path_in_repo=audio_metadata_path,
 
133
  token=HF_TOKEN
134
  )
135
 
 
136
  os.unlink(temp_meta_path)
137
 
138
  return {
 
140
  "file_url": file_url,
141
  "dataset_path": dataset_path,
142
  "filename": filename,
143
+ "project_id": folder_name,
144
  "metadata": metadata_entry
145
  }
146
 
 
150
  "error": str(e)
151
  }
152
 
153
+ async def generate_speech(text, voice_id, emotion_id, speed, video_title, project_id=None):
154
  """
155
+ Generate speech and save to dataset
 
 
 
156
  """
157
  try:
 
158
  voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")
 
 
159
  emotion_params = get_emotion_params(emotion_id)
160
 
 
161
  rate_percentage = int(emotion_params["rate"].replace("%", "").replace("+", ""))
162
  adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
163
  rate = f"{adjusted_rate:+d}%"
164
 
 
165
  communicate = edge_tts.Communicate(
166
  text,
167
  voice,
 
170
  volume=emotion_params["volume"]
171
  )
172
 
 
173
  temp_dir = tempfile.mkdtemp()
174
  local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
175
 
176
  await communicate.save(local_audio_path)
177
 
 
178
  metadata = {
179
  "text": text,
180
  "voice_id": voice_id,
 
188
  }
189
  }
190
 
191
+ upload_result = upload_to_dataset(local_audio_path, metadata, video_title, project_id)
 
 
 
192
  shutil.rmtree(temp_dir)
193
 
194
  if upload_result["success"]:
195
+ return {
196
  "success": True,
197
+ "message": f"Audio generated and saved to dataset",
198
  "video_title": video_title,
199
+ "project_id": upload_result["project_id"],
200
  "audio_url": upload_result["file_url"],
201
  "dataset_path": upload_result["dataset_path"],
202
  "filename": upload_result["filename"],
203
+ "metadata": upload_result["metadata"]
 
204
  }
205
  else:
206
+ return {
207
+ "success": False,
208
+ "error": upload_result["error"]
 
 
 
209
  }
210
 
211
  except Exception as e:
212
+ return {
213
  "success": False,
214
  "error": str(e)
215
  }
216
 
217
+ # =============================================
218
+ # FASTAPI ENDPOINTS FOR n8n
219
+ # =============================================
 
 
 
 
 
220
 
221
@fastapi_app.get("/")
async def root():
    """Describe the service: its name and the available API endpoints."""
    # NOTE(review): the Gradio UI is mounted at path "/" further down in this
    # file; this route is registered first - confirm which one is meant to
    # answer GET "/".
    endpoints = {
        "generate": "POST /api/generate",
        "health": "GET /api/health",
    }
    return {"name": "TTS API", "endpoints": endpoints}
230
+
231
@fastapi_app.get("/api/health")
async def health():
    """Liveness probe: always reports the service as healthy."""
    payload = {"status": "healthy", "service": "tts"}
    return payload
234
+
235
@fastapi_app.post("/api/generate")
async def generate_tts(request: dict):
    """API endpoint for n8n - returns permanent dataset URL.

    Reads these keys from the JSON body:
        text (required), voice_id (default 1), emotion_id (default 0),
        speed (default 1.0), video_title (default "Untitled Video"),
        project_id (optional).

    Returns:
        The dict produced by ``generate_speech`` when the request is valid.
        For invalid input or an unexpected failure, a dict with
        ``success: False``, ``status: "error"`` and an ``error`` message.
        The HTTP status code is left unchanged in both cases.
    """

    def _error(message):
        # Keep the legacy "status" key and add the "success" flag that
        # generate_speech results carry, so clients can test a single key.
        return {"success": False, "status": "error", "error": message}

    try:
        text = request.get("text") or ""
        # Whitespace-only text would synthesize nothing useful.
        if not isinstance(text, str) or not text.strip():
            return _error("No text provided")

        try:
            voice_id = int(request.get("voice_id", 1))
            emotion_id = int(request.get("emotion_id", 0))
            speed = float(request.get("speed", 1.0))
        except (TypeError, ValueError):
            return _error(
                "voice_id and emotion_id must be integers; speed must be a number"
            )

        # Reject ids up front: the upload step indexes VOICE_DESCRIPTIONS and
        # a five-entry emotion name list, so a bad id would otherwise fail
        # only after the audio has already been synthesized.
        if voice_id not in VOICE_DESCRIPTIONS:
            return _error(f"Unknown voice_id: {voice_id}")
        if not 0 <= emotion_id <= 4:
            return _error(f"Unknown emotion_id: {emotion_id}")

        # An explicit null title falls back to the default as well.
        video_title = request.get("video_title") or "Untitled Video"

        # Optional project ID from n8n. It is used verbatim as a folder name
        # in the dataset path, so refuse anything that could escape it.
        project_id = request.get("project_id")
        if project_id is not None:
            project_text = str(project_id)
            if "/" in project_text or "\\" in project_text or ".." in project_text:
                return _error("Invalid project_id")

        return await generate_speech(
            text, voice_id, emotion_id, speed, video_title, project_id
        )

    except Exception as e:
        # Top-level boundary: report the failure in the response body.
        return _error(str(e))
254
+
255
+ # =============================================
256
+ # GRADIO INTERFACE
257
+ # =============================================
258
with gr.Blocks(title="TTS with Dataset Storage") as demo:
    gr.Markdown("# 🎙️ TTS API with Hugging Face Dataset Storage")

    with gr.Row():
        # Left column: request inputs.
        with gr.Column(scale=1):
            video_title_input = gr.Textbox(
                label="🎬 Video Title",
                placeholder="Enter video title...",
                value="My Video"
            )
            project_id_input = gr.Textbox(
                label="📁 Project ID (optional)",
                placeholder="Enter project ID if known..."
            )
            text_input = gr.Textbox(
                label="📝 Text to synthesize",
                placeholder="输入中文或English...",
                lines=3,
                value="你好,欢迎使用语音合成服务。"
            )
            voice_slider = gr.Slider(minimum=0, maximum=4, step=1, value=1, label="Voice ID")
            emotion_slider = gr.Slider(minimum=0, maximum=4, step=1, value=0, label="Emotion ID")
            speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Speed")
            generate_btn = gr.Button("🎵 Generate", variant="primary")

        # Right column: results.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            json_output = gr.JSON(label="Response Data")

    async def _on_generate(text, voice_id, emotion_id, speed, video_title, project_id):
        """Run generate_speech for the UI and return (audio, json) outputs.

        generate_speech returns a single result dict, but the click event
        feeds two components, so the handler must return two values. The
        local audio file is deleted before generate_speech returns, so the
        player is pointed at the uploaded file's URL (None on failure).
        """
        result = await generate_speech(
            text,
            int(voice_id),       # sliders may deliver floats; ids index lists/dicts
            int(emotion_id),
            float(speed),
            video_title,
            project_id or None,  # empty textbox means "no project id"
        )
        return result.get("audio_url"), result

    # Gradio awaits async handlers itself, so no asyncio.run wrapper is needed.
    generate_btn.click(
        fn=_on_generate,
        inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input, project_id_input],
        outputs=[audio_output, json_output]
    )
292
 
293
+ # =============================================
294
+ # MAIN - Mount Gradio to FastAPI
295
+ # =============================================
296
# Serve the Gradio UI from the same ASGI application as the REST endpoints.
app = gr.mount_gradio_app(app=fastapi_app, blocks=demo, path="/")


if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)