yukee1992 committed on
Commit
480a42a
·
verified ·
1 Parent(s): 4d01200

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -49
app.py CHANGED
@@ -9,10 +9,11 @@ from huggingface_hub import HfApi, upload_file
9
  import uuid
10
  from datetime import datetime
11
  import shutil
 
12
 
13
  # Configuration
14
- HF_TOKEN = os.environ.get("HF_TOKEN") # You'll set this in Space secrets
15
- DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/tts-audio-dataset") # Your dataset name
16
 
17
  # Initialize Hugging Face API
18
  hf_api = HfApi(token=HF_TOKEN)
@@ -34,6 +35,13 @@ VOICE_DESCRIPTIONS = {
34
  4: "Professional (Yunxi) - Clear, broadcast"
35
  }
36
 
 
 
 
 
 
 
 
37
  def get_emotion_params(emotion_id):
38
  """Convert emotion ID to speech parameters"""
39
  emotions = {
@@ -45,34 +53,36 @@ def get_emotion_params(emotion_id):
45
  }
46
  return emotions.get(emotion_id, emotions[0])
47
 
48
- def upload_to_dataset(audio_path, metadata):
49
  """
50
- Upload audio file to Hugging Face dataset and return URL
51
 
52
  Args:
53
  audio_path: Local path to audio file
54
  metadata: Dictionary with generation metadata
 
55
 
56
  Returns:
57
  dict: Upload result with file URL
58
  """
59
  try:
60
- # Generate unique filename
61
- file_id = str(uuid.uuid4())[:8]
62
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
63
 
64
- # Create filename with metadata
65
  voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
66
  emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
67
  emotion_name = emotion_names[metadata["emotion_id"]]
68
 
69
- filename = f"tts_{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
 
70
 
71
- # Path in dataset (organize by date)
72
- date_path = datetime.now().strftime("%Y/%m/%d")
73
- dataset_path = f"audio/{date_path}/{filename}"
74
 
75
- # Upload file to dataset
76
  upload_file(
77
  path_or_fileobj=audio_path,
78
  path_in_repo=dataset_path,
@@ -84,12 +94,15 @@ def upload_to_dataset(audio_path, metadata):
84
  # Generate the raw file URL
85
  file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
86
 
87
- # Also create/update metadata JSON file
88
  metadata_entry = {
89
  "file_id": file_id,
 
90
  "filename": filename,
91
  "dataset_path": dataset_path,
92
  "file_url": file_url,
 
 
93
  "timestamp": timestamp,
94
  "text": metadata["text"],
95
  "voice_id": metadata["voice_id"],
@@ -100,16 +113,29 @@ def upload_to_dataset(audio_path, metadata):
100
  "parameters": metadata["parameters"]
101
  }
102
 
103
- # Update metadata index (optional - stores all generations history)
104
- metadata_filename = f"metadata/{date_path}/{file_id}.json"
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
106
  json.dump(metadata_entry, f, indent=2)
107
  temp_meta_path = f.name
108
 
109
- # Upload metadata
110
  upload_file(
111
  path_or_fileobj=temp_meta_path,
112
- path_in_repo=metadata_filename,
113
  repo_id=DATASET_REPO,
114
  repo_type="dataset",
115
  token=HF_TOKEN
@@ -123,6 +149,7 @@ def upload_to_dataset(audio_path, metadata):
123
  "file_url": file_url,
124
  "dataset_path": dataset_path,
125
  "filename": filename,
 
126
  "metadata": metadata_entry
127
  }
128
 
@@ -132,9 +159,9 @@ def upload_to_dataset(audio_path, metadata):
132
  "error": str(e)
133
  }
134
 
135
- async def generate_speech(text, voice_id, emotion_id, speed=1.0):
136
  """
137
- Generate speech and save to dataset
138
 
139
  Returns:
140
  tuple: (local_audio_path, response_data)
@@ -180,25 +207,25 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
180
  }
181
  }
182
 
183
- # Upload to dataset
184
- upload_result = upload_to_dataset(local_audio_path, metadata)
185
 
186
  # Cleanup temp directory
187
  shutil.rmtree(temp_dir)
188
 
189
  if upload_result["success"]:
190
- # Return both local file (for immediate playback) and dataset URL
191
  return local_audio_path, {
192
  "success": True,
193
- "message": "Audio generated and saved to dataset",
194
- "audio_url": upload_result["file_url"], # Permanent URL for n8n
 
 
195
  "dataset_path": upload_result["dataset_path"],
196
  "filename": upload_result["filename"],
197
  "metadata": upload_result["metadata"],
198
- "local_audio_available": True # For web interface playback
199
  }
200
  else:
201
- # If upload fails, still return local audio but with warning
202
  return local_audio_path, {
203
  "success": True,
204
  "message": "Audio generated but failed to save to dataset",
@@ -213,33 +240,55 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
213
  "error": str(e)
214
  }
215
 
216
- def tts_wrapper(text, voice_id, emotion_id, speed):
217
  """Wrapper function to handle async"""
218
  loop = asyncio.new_event_loop()
219
  asyncio.set_event_loop(loop)
220
  audio_path, metadata = loop.run_until_complete(
221
- generate_speech(text, voice_id, emotion_id, speed)
222
  )
223
  return audio_path, metadata
224
 
225
  # Create Gradio interface
226
- with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Soft()) as demo:
227
  gr.Markdown("""
228
- # 🎙️ Chinese TTS API with Hugging Face Dataset Storage
229
- ### Generate speech and automatically save to dataset with permanent URL
230
 
231
- ## 🔗 Dataset Integration
232
- - Audio files are automatically saved to your Hugging Face dataset
233
- - Returns permanent URL for use in n8n workflows
234
- - Files organized by date in the dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  """)
236
 
237
  with gr.Row():
238
  with gr.Column(scale=1):
 
 
 
 
 
 
 
239
  text_input = gr.Textbox(
240
- label="📝 Text (支持中文/English)",
241
- placeholder="输入要转换的文字...",
242
- lines=4,
243
  value="你好,欢迎使用语音合成服务。"
244
  )
245
 
@@ -262,23 +311,23 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
262
  label="Speed"
263
  )
264
 
265
- generate_btn = gr.Button("🎵 Generate & Save to Dataset", variant="primary", size="lg")
266
 
267
  with gr.Column(scale=1):
268
  audio_output = gr.Audio(
269
- label="Generated Audio (Local)",
270
  type="filepath"
271
  )
272
  json_output = gr.JSON(
273
- label="Response Data (includes permanent dataset URL)"
274
  )
275
 
276
- # Show dataset info
277
  gr.Markdown(f"""
278
  ### 📊 Dataset Info
279
  - **Dataset:** `{DATASET_REPO}`
280
- - Audio files saved to: `/audio/YYYY/MM/DD/`
281
- - Metadata saved to: `/metadata/YYYY/MM/DD/`
282
  """)
283
 
284
  # Update previews
@@ -286,7 +335,7 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
286
  return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
287
 
288
  def update_emotion_preview(emotion_id):
289
- emotions = ["Neutral", "Happy", "Sad", "Exicted", "Frustrated"]
290
  return f"**Selected:** {emotions[emotion_id]}"
291
 
292
  voice_slider.change(
@@ -304,7 +353,7 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
304
  # Generate button click
305
  generate_btn.click(
306
  fn=tts_wrapper,
307
- inputs=[text_input, voice_slider, emotion_slider, speed_slider],
308
  outputs=[audio_output, json_output]
309
  )
310
 
@@ -315,17 +364,20 @@ async def api_generate(params):
315
  voice_id = int(params.get("voice_id", 1))
316
  emotion_id = int(params.get("emotion_id", 0))
317
  speed = float(params.get("speed", 1.0))
 
318
 
319
- audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed)
320
 
321
  if metadata["success"]:
322
  return {
323
  "status": "success",
324
- "audio_url": metadata.get("audio_url"), # Permanent dataset URL
 
 
325
  "dataset_path": metadata.get("dataset_path"),
326
  "filename": metadata.get("filename"),
327
  "metadata": metadata.get("metadata"),
328
- "message": metadata.get("message", "Audio generated successfully")
329
  }
330
  else:
331
  return {
 
9
  import uuid
10
  from datetime import datetime
11
  import shutil
12
+ import re
13
 
14
  # Configuration
15
+ HF_TOKEN = os.environ.get("HF_TOKEN") # Set in Space secrets
16
+ DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/video-media-dataset") # Your dataset name
17
 
18
  # Initialize Hugging Face API
19
  hf_api = HfApi(token=HF_TOKEN)
 
35
  4: "Professional (Yunxi) - Clear, broadcast"
36
  }
37
 
38
+ def sanitize_folder_name(title):
39
+ """Convert video title to safe folder name"""
40
+ # Remove special characters and replace spaces with underscores
41
+ safe_name = re.sub(r'[^\w\s-]', '', title)
42
+ safe_name = re.sub(r'[-\s]+', '_', safe_name)
43
+ return safe_name.strip('_')
44
+
45
  def get_emotion_params(emotion_id):
46
  """Convert emotion ID to speech parameters"""
47
  emotions = {
 
53
  }
54
  return emotions.get(emotion_id, emotions[0])
55
 
56
+ def upload_to_dataset(audio_path, metadata, video_title):
57
  """
58
+ Upload audio file to Hugging Face dataset under video title folder
59
 
60
  Args:
61
  audio_path: Local path to audio file
62
  metadata: Dictionary with generation metadata
63
+ video_title: Title of the video (used as folder name)
64
 
65
  Returns:
66
  dict: Upload result with file URL
67
  """
68
  try:
69
+ # Create safe folder name from video title
70
+ folder_name = sanitize_folder_name(video_title)
71
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
72
+ file_id = str(uuid.uuid4())[:8]
73
 
74
+ # Get voice and emotion info
75
  voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
76
  emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
77
  emotion_name = emotion_names[metadata["emotion_id"]]
78
 
79
+ # Create filename: [timestamp]_[voice]_[emotion]_[fileid].mp3
80
+ filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
81
 
82
+ # Path in dataset: /[video_title]/audio/[filename]
83
+ dataset_path = f"{folder_name}/audio/{filename}"
 
84
 
85
+ # Upload audio file to dataset
86
  upload_file(
87
  path_or_fileobj=audio_path,
88
  path_in_repo=dataset_path,
 
94
  # Generate the raw file URL
95
  file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
96
 
97
+ # Create metadata entry
98
  metadata_entry = {
99
  "file_id": file_id,
100
+ "type": "audio",
101
  "filename": filename,
102
  "dataset_path": dataset_path,
103
  "file_url": file_url,
104
+ "video_title": video_title,
105
+ "video_folder": folder_name,
106
  "timestamp": timestamp,
107
  "text": metadata["text"],
108
  "voice_id": metadata["voice_id"],
 
113
  "parameters": metadata["parameters"]
114
  }
115
 
116
+ # Update or create video metadata file (stores all assets for this video)
117
+ video_metadata_path = f"{folder_name}/metadata.json"
118
+
119
+ # Try to download existing metadata if it exists
120
+ existing_metadata = []
121
+ try:
122
+ # This is a simplified approach - in production you'd want to properly manage metadata
123
+ pass
124
+ except:
125
+ existing_metadata = []
126
+
127
+ # For now, we'll create a separate metadata file for each audio
128
+ # You can enhance this to maintain a single metadata file per video
129
+ audio_metadata_path = f"{folder_name}/metadata/audio_{file_id}.json"
130
+
131
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
132
  json.dump(metadata_entry, f, indent=2)
133
  temp_meta_path = f.name
134
 
135
+ # Upload audio metadata
136
  upload_file(
137
  path_or_fileobj=temp_meta_path,
138
+ path_in_repo=audio_metadata_path,
139
  repo_id=DATASET_REPO,
140
  repo_type="dataset",
141
  token=HF_TOKEN
 
149
  "file_url": file_url,
150
  "dataset_path": dataset_path,
151
  "filename": filename,
152
+ "video_folder": folder_name,
153
  "metadata": metadata_entry
154
  }
155
 
 
159
  "error": str(e)
160
  }
161
 
162
+ async def generate_speech(text, voice_id, emotion_id, speed, video_title):
163
  """
164
+ Generate speech and save to dataset under video title folder
165
 
166
  Returns:
167
  tuple: (local_audio_path, response_data)
 
207
  }
208
  }
209
 
210
+ # Upload to dataset under video title folder
211
+ upload_result = upload_to_dataset(local_audio_path, metadata, video_title)
212
 
213
  # Cleanup temp directory
214
  shutil.rmtree(temp_dir)
215
 
216
  if upload_result["success"]:
 
217
  return local_audio_path, {
218
  "success": True,
219
+ "message": f"Audio generated and saved to dataset under folder: {video_title}",
220
+ "video_title": video_title,
221
+ "video_folder": upload_result["video_folder"],
222
+ "audio_url": upload_result["file_url"],
223
  "dataset_path": upload_result["dataset_path"],
224
  "filename": upload_result["filename"],
225
  "metadata": upload_result["metadata"],
226
+ "local_audio_available": True
227
  }
228
  else:
 
229
  return local_audio_path, {
230
  "success": True,
231
  "message": "Audio generated but failed to save to dataset",
 
240
  "error": str(e)
241
  }
242
 
243
+ def tts_wrapper(text, voice_id, emotion_id, speed, video_title):
244
  """Wrapper function to handle async"""
245
  loop = asyncio.new_event_loop()
246
  asyncio.set_event_loop(loop)
247
  audio_path, metadata = loop.run_until_complete(
248
+ generate_speech(text, voice_id, emotion_id, speed, video_title)
249
  )
250
  return audio_path, metadata
251
 
252
  # Create Gradio interface
253
+ with gr.Blocks(title="TTS with Dataset Storage by Video Title", theme=gr.themes.Soft()) as demo:
254
  gr.Markdown("""
255
+ # 🎙️ TTS API with Hugging Face Dataset Storage
256
+ ### Audio files organized by video title folders
257
 
258
+ ## 📁 Dataset Structure
259
+ ```
260
+ your-dataset/
261
+ ├── [Video_Title_1]/
262
+ │ ├── audio/
263
+ │ │ ├── 20240115_143022_Xiaoyi_happy_a1b2.mp3
264
+ │ │ └── 20240115_143145_Xiaoxiao_neutral_e5f6.mp3
265
+ │ └── metadata/
266
+ │ ├── audio_a1b2.json
267
+ │ └── audio_e5f6.json
268
+ ├── [Video_Title_2]/
269
+ │ ├── audio/
270
+ │ │ └── 20240115_144512_Yunjian_excited_g7h8.mp3
271
+ │ └── metadata/
272
+ │ └── audio_g7h8.json
273
+ └── images/ (for future image storage)
274
+ └── [Video_Title]/
275
+ └── thumbnail.jpg
276
+ ```
277
  """)
278
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
+ video_title_input = gr.Textbox(
282
+ label="🎬 Video Title (used as folder name)",
283
+ placeholder="Enter video title...",
284
+ value="My Awesome Video",
285
+ info="This will create a folder with this name in the dataset"
286
+ )
287
+
288
  text_input = gr.Textbox(
289
+ label="📝 Text to synthesize",
290
+ placeholder="θΎ“ε…₯δΈ­ζ–‡ζˆ–English...",
291
+ lines=3,
292
  value="你好,欢迎使用语音合成服务。"
293
  )
294
 
 
311
  label="Speed"
312
  )
313
 
314
+ generate_btn = gr.Button("🎵 Generate & Save to Video Folder", variant="primary", size="lg")
315
 
316
  with gr.Column(scale=1):
317
  audio_output = gr.Audio(
318
+ label="Generated Audio",
319
  type="filepath"
320
  )
321
  json_output = gr.JSON(
322
+ label="Response Data (includes dataset URL)"
323
  )
324
 
325
+ # Show dataset structure preview
326
  gr.Markdown(f"""
327
  ### 📊 Dataset Info
328
  - **Dataset:** `{DATASET_REPO}`
329
+ - **Structure:** `/[Video Title]/audio/[file].mp3`
330
+ - **Metadata:** `/[Video Title]/metadata/[file_id].json`
331
  """)
332
 
333
  # Update previews
 
335
  return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
336
 
337
  def update_emotion_preview(emotion_id):
338
+ emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
339
  return f"**Selected:** {emotions[emotion_id]}"
340
 
341
  voice_slider.change(
 
353
  # Generate button click
354
  generate_btn.click(
355
  fn=tts_wrapper,
356
+ inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input],
357
  outputs=[audio_output, json_output]
358
  )
359
 
 
364
  voice_id = int(params.get("voice_id", 1))
365
  emotion_id = int(params.get("emotion_id", 0))
366
  speed = float(params.get("speed", 1.0))
367
+ video_title = params.get("video_title", "Untitled Video")
368
 
369
+ audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed, video_title)
370
 
371
  if metadata["success"]:
372
  return {
373
  "status": "success",
374
+ "video_title": metadata.get("video_title"),
375
+ "video_folder": metadata.get("video_folder"),
376
+ "audio_url": metadata.get("audio_url"),
377
  "dataset_path": metadata.get("dataset_path"),
378
  "filename": metadata.get("filename"),
379
  "metadata": metadata.get("metadata"),
380
+ "message": metadata.get("message")
381
  }
382
  else:
383
  return {