Spaces:

yukee1992
/

Tts-api-new

Paused

App Files Files Community

yukee1992 committed on Mar 6

Commit

480a42a

verified ·

1 Parent(s): 4d01200

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -49

app.py CHANGED Viewed

@@ -9,10 +9,11 @@ from huggingface_hub import HfApi, upload_file
 import uuid
 from datetime import datetime
 import shutil
 # Configuration
-HF_TOKEN = os.environ.get("HF_TOKEN")  # You'll set this in Space secrets
-DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/tts-audio-dataset")  # Your dataset name
 # Initialize Hugging Face API
 hf_api = HfApi(token=HF_TOKEN)
@@ -34,6 +35,13 @@ VOICE_DESCRIPTIONS = {
     4: "Professional (Yunxi) - Clear, broadcast"
 }
 def get_emotion_params(emotion_id):
     """Convert emotion ID to speech parameters"""
     emotions = {
@@ -45,34 +53,36 @@ def get_emotion_params(emotion_id):
     }
     return emotions.get(emotion_id, emotions[0])
-def upload_to_dataset(audio_path, metadata):
     """
-    Upload audio file to Hugging Face dataset and return URL
     Args:
         audio_path: Local path to audio file
         metadata: Dictionary with generation metadata
     Returns:
         dict: Upload result with file URL
     """
     try:
-        # Generate unique filename
-        file_id = str(uuid.uuid4())[:8]
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # Create filename with metadata
         voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
         emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
         emotion_name = emotion_names[metadata["emotion_id"]]
-        filename = f"tts_{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
-        # Path in dataset (organize by date)
-        date_path = datetime.now().strftime("%Y/%m/%d")
-        dataset_path = f"audio/{date_path}/{filename}"
-        # Upload file to dataset
         upload_file(
             path_or_fileobj=audio_path,
             path_in_repo=dataset_path,
@@ -84,12 +94,15 @@ def upload_to_dataset(audio_path, metadata):
         # Generate the raw file URL
         file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
-        # Also create/update metadata JSON file
         metadata_entry = {
             "file_id": file_id,
             "filename": filename,
             "dataset_path": dataset_path,
             "file_url": file_url,
             "timestamp": timestamp,
             "text": metadata["text"],
             "voice_id": metadata["voice_id"],
@@ -100,16 +113,29 @@ def upload_to_dataset(audio_path, metadata):
             "parameters": metadata["parameters"]
         }
-        # Update metadata index (optional - stores all generations history)
-        metadata_filename = f"metadata/{date_path}/{file_id}.json"
         with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
             json.dump(metadata_entry, f, indent=2)
             temp_meta_path = f.name
-        # Upload metadata
         upload_file(
             path_or_fileobj=temp_meta_path,
-            path_in_repo=metadata_filename,
             repo_id=DATASET_REPO,
             repo_type="dataset",
             token=HF_TOKEN
@@ -123,6 +149,7 @@ def upload_to_dataset(audio_path, metadata):
             "file_url": file_url,
             "dataset_path": dataset_path,
             "filename": filename,
             "metadata": metadata_entry
         }
@@ -132,9 +159,9 @@ def upload_to_dataset(audio_path, metadata):
             "error": str(e)
         }
-async def generate_speech(text, voice_id, emotion_id, speed=1.0):
     """
-    Generate speech and save to dataset
     Returns:
         tuple: (local_audio_path, response_data)
@@ -180,25 +207,25 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
             }
         }
-        # Upload to dataset
-        upload_result = upload_to_dataset(local_audio_path, metadata)
         # Cleanup temp directory
         shutil.rmtree(temp_dir)
         if upload_result["success"]:
-            # Return both local file (for immediate playback) and dataset URL
             return local_audio_path, {
                 "success": True,
-                "message": "Audio generated and saved to dataset",
-                "audio_url": upload_result["file_url"],  # Permanent URL for n8n
                 "dataset_path": upload_result["dataset_path"],
                 "filename": upload_result["filename"],
                 "metadata": upload_result["metadata"],
-                "local_audio_available": True  # For web interface playback
             }
         else:
-            # If upload fails, still return local audio but with warning
             return local_audio_path, {
                 "success": True,
                 "message": "Audio generated but failed to save to dataset",
@@ -213,33 +240,55 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
             "error": str(e)
         }
-def tts_wrapper(text, voice_id, emotion_id, speed):
     """Wrapper function to handle async"""
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     audio_path, metadata = loop.run_until_complete(
-        generate_speech(text, voice_id, emotion_id, speed)
     )
     return audio_path, metadata
 # Create Gradio interface
-with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎙️ Chinese TTS API with Hugging Face Dataset Storage
-    ### Generate speech and automatically save to dataset with permanent URL
-    ## 🔗 Dataset Integration
-    - Audio files are automatically saved to your Hugging Face dataset
-    - Returns permanent URL for use in n8n workflows
-    - Files organized by date in the dataset
     """)
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
-                label="📝 Text (支持中文/English)",
-                placeholder="输入要转换的文字...",
-                lines=4,
                 value="你好，欢迎使用语音合成服务。"
             )
@@ -262,23 +311,23 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
                 label="Speed"
             )
-            generate_btn = gr.Button("🎵 Generate & Save to Dataset", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
-                label="Generated Audio (Local)",
                 type="filepath"
             )
             json_output = gr.JSON(
-                label="Response Data (includes permanent dataset URL)"
             )
-            # Show dataset info
             gr.Markdown(f"""
             ### 📊 Dataset Info
             - **Dataset:** `{DATASET_REPO}`
-            - Audio files saved to: `/audio/YYYY/MM/DD/`
-            - Metadata saved to: `/metadata/YYYY/MM/DD/`
             """)
     # Update previews
@@ -286,7 +335,7 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
         return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
     def update_emotion_preview(emotion_id):
-        emotions = ["Neutral", "Happy", "Sad", "Exicted", "Frustrated"]
         return f"**Selected:** {emotions[emotion_id]}"
     voice_slider.change(
@@ -304,7 +353,7 @@ with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Sof
     # Generate button click
     generate_btn.click(
         fn=tts_wrapper,
-        inputs=[text_input, voice_slider, emotion_slider, speed_slider],
         outputs=[audio_output, json_output]
     )
@@ -315,17 +364,20 @@ async def api_generate(params):
     voice_id = int(params.get("voice_id", 1))
     emotion_id = int(params.get("emotion_id", 0))
     speed = float(params.get("speed", 1.0))
-    audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed)
     if metadata["success"]:
         return {
             "status": "success",
-            "audio_url": metadata.get("audio_url"),  # Permanent dataset URL
             "dataset_path": metadata.get("dataset_path"),
             "filename": metadata.get("filename"),
             "metadata": metadata.get("metadata"),
-            "message": metadata.get("message", "Audio generated successfully")
         }
     else:
         return {

 import uuid
 from datetime import datetime
 import shutil
+import re
 # Configuration
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Set in Space secrets
+DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/video-media-dataset")  # Your dataset name
 # Initialize Hugging Face API
 hf_api = HfApi(token=HF_TOKEN)
     4: "Professional (Yunxi) - Clear, broadcast"
 }
def sanitize_folder_name(title):
    """Convert a video title into a name that is safe to use as a folder.

    Characters other than word characters, whitespace and hyphens are
    removed, every run of hyphens/whitespace becomes a single underscore,
    and leading/trailing underscores are trimmed.

    Args:
        title: Video title. ``None`` is treated as an empty title and any
            other non-string value is converted with ``str()``.

    Returns:
        str: The sanitized folder name, or ``"Untitled_Video"`` when nothing
        usable is left (empty, whitespace-only or punctuation-only titles).
    """
    text = "" if title is None else str(title)
    # Drop everything except word characters, whitespace and hyphens.
    safe_name = re.sub(r'[^\w\s-]', '', text)
    # Collapse each run of hyphens/whitespace into one underscore.
    safe_name = re.sub(r'[-\s]+', '_', safe_name).strip('_')
    # An empty name would produce a dataset path that starts with "/"
    # (f"{folder_name}/audio/..."), so fall back to the sanitized form of
    # the API's default title "Untitled Video".
    return safe_name or "Untitled_Video"
 def get_emotion_params(emotion_id):
     """Convert emotion ID to speech parameters"""
     emotions = {
     }
     return emotions.get(emotion_id, emotions[0])
+def upload_to_dataset(audio_path, metadata, video_title):
     """
+    Upload audio file to Hugging Face dataset under video title folder
     Args:
         audio_path: Local path to audio file
         metadata: Dictionary with generation metadata
+        video_title: Title of the video (used as folder name)
     Returns:
         dict: Upload result with file URL
     """
     try:
+        # Create safe folder name from video title
+        folder_name = sanitize_folder_name(video_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        file_id = str(uuid.uuid4())[:8]
+        # Get voice and emotion info
         voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
         emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
         emotion_name = emotion_names[metadata["emotion_id"]]
+        # Create filename: [timestamp]_[voice]_[emotion]_[fileid].mp3
+        filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
+        # Path in dataset: /[video_title]/audio/[filename]
+        dataset_path = f"{folder_name}/audio/{filename}"
+        # Upload audio file to dataset
         upload_file(
             path_or_fileobj=audio_path,
             path_in_repo=dataset_path,
         # Generate the raw file URL
         file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
+        # Create metadata entry
         metadata_entry = {
             "file_id": file_id,
+            "type": "audio",
             "filename": filename,
             "dataset_path": dataset_path,
             "file_url": file_url,
+            "video_title": video_title,
+            "video_folder": folder_name,
             "timestamp": timestamp,
             "text": metadata["text"],
             "voice_id": metadata["voice_id"],
             "parameters": metadata["parameters"]
         }
+        # Update or create video metadata file (stores all assets for this video)
+        video_metadata_path = f"{folder_name}/metadata.json"
+        # Try to download existing metadata if it exists
+        existing_metadata = []
+        try:
+            # This is a simplified approach - in production you'd want to properly manage metadata
+            pass
+        except:
+            existing_metadata = []
+        # For now, we'll create a separate metadata file for each audio
+        # You can enhance this to maintain a single metadata file per video
+        audio_metadata_path = f"{folder_name}/metadata/audio_{file_id}.json"
         with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
             json.dump(metadata_entry, f, indent=2)
             temp_meta_path = f.name
+        # Upload audio metadata
         upload_file(
             path_or_fileobj=temp_meta_path,
+            path_in_repo=audio_metadata_path,
             repo_id=DATASET_REPO,
             repo_type="dataset",
             token=HF_TOKEN
             "file_url": file_url,
             "dataset_path": dataset_path,
             "filename": filename,
+            "video_folder": folder_name,
             "metadata": metadata_entry
         }
             "error": str(e)
         }
+async def generate_speech(text, voice_id, emotion_id, speed, video_title):
     """
+    Generate speech and save to dataset under video title folder
     Returns:
         tuple: (local_audio_path, response_data)
             }
         }
+        # Upload to dataset under video title folder
+        upload_result = upload_to_dataset(local_audio_path, metadata, video_title)
         # Cleanup temp directory
         shutil.rmtree(temp_dir)
         if upload_result["success"]:
             return local_audio_path, {
                 "success": True,
+                "message": f"Audio generated and saved to dataset under folder: {video_title}",
+                "video_title": video_title,
+                "video_folder": upload_result["video_folder"],
+                "audio_url": upload_result["file_url"],
                 "dataset_path": upload_result["dataset_path"],
                 "filename": upload_result["filename"],
                 "metadata": upload_result["metadata"],
+                "local_audio_available": True
             }
         else:
             return local_audio_path, {
                 "success": True,
                 "message": "Audio generated but failed to save to dataset",
             "error": str(e)
         }
def tts_wrapper(text, voice_id, emotion_id, speed, video_title):
    """Run the async ``generate_speech`` coroutine from synchronous code.

    Args:
        text: Text to synthesize.
        voice_id: Voice identifier forwarded to ``generate_speech``.
        emotion_id: Emotion identifier forwarded to ``generate_speech``.
        speed: Speech speed forwarded to ``generate_speech``.
        video_title: Video title forwarded to ``generate_speech``.

    Returns:
        tuple: ``(audio_path, metadata)`` exactly as returned by
        ``generate_speech``.
    """
    # asyncio.run() creates a fresh event loop, runs the coroutine and always
    # closes the loop afterwards, so repeated calls no longer leave an
    # unclosed loop behind as the manual new_event_loop() version did.
    audio_path, metadata = asyncio.run(
        generate_speech(text, voice_id, emotion_id, speed, video_title)
    )
    return audio_path, metadata
 # Create Gradio interface
+with gr.Blocks(title="TTS with Dataset Storage by Video Title", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🎙️ TTS API with Hugging Face Dataset Storage
+    ### Audio files organized by video title folders
+    ## 📁 Dataset Structure
+    ```
+    your-dataset/
+    ├── [Video_Title_1]/
+    │   ├── audio/
+    │   │   ├── 20240115_143022_Xiaoyi_happy_a1b2.mp3
+    │   │   └── 20240115_143145_Xiaoxiao_neutral_e5f6.mp3
+    │   └── metadata/
+    │       ├── audio_a1b2.json
+    │       └── audio_e5f6.json
+    ├── [Video_Title_2]/
+    │   ├── audio/
+    │   │   └── 20240115_144512_Yunjian_excited_g7h8.mp3
+    │   └── metadata/
+    │       └── audio_g7h8.json
+    └── images/  (for future image storage)
+        └── [Video_Title]/
+            └── thumbnail.jpg
+    ```
     """)
     with gr.Row():
         with gr.Column(scale=1):
+            video_title_input = gr.Textbox(
+                label="🎬 Video Title (used as folder name)",
+                placeholder="Enter video title...",
+                value="My Awesome Video",
+                info="This will create a folder with this name in the dataset"
+            )
             text_input = gr.Textbox(
+                label="📝 Text to synthesize",
+                placeholder="输入中文或English...",
+                lines=3,
                 value="你好，欢迎使用语音合成服务。"
             )
                 label="Speed"
             )
+            generate_btn = gr.Button("🎵 Generate & Save to Video Folder", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
+                label="Generated Audio",
                 type="filepath"
             )
             json_output = gr.JSON(
+                label="Response Data (includes dataset URL)"
             )
+            # Show dataset structure preview
             gr.Markdown(f"""
             ### 📊 Dataset Info
             - **Dataset:** `{DATASET_REPO}`
+            - **Structure:** `/[Video Title]/audio/[file].mp3`
+            - **Metadata:** `/[Video Title]/metadata/[file_id].json`
             """)
     # Update previews
         return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
     def update_emotion_preview(emotion_id):
         """Build the markdown preview line for the selected emotion.

         Args:
             emotion_id: Emotion identifier (0-4). Integral floats such as
                 ``2.0`` are accepted because the lookup is by equality.

         Returns:
             str: ``"**Selected:** <name>"``. Unknown identifiers fall back
             to the first entry ("Neutral"), the same fallback that
             ``get_emotion_params`` applies via
             ``emotions.get(emotion_id, emotions[0])``.
         """
         # A dict lookup instead of list indexing: no TypeError for float
         # IDs, no IndexError for out-of-range IDs, and no silent
         # wrap-around for negative IDs.
         emotions = {
             0: "Neutral",
             1: "Happy",
             2: "Sad",
             3: "Excited",
             4: "Frustrated",
         }
         return f"**Selected:** {emotions.get(emotion_id, emotions[0])}"
     voice_slider.change(
     # Generate button click
     generate_btn.click(
         fn=tts_wrapper,
+        inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input],
         outputs=[audio_output, json_output]
     )
     voice_id = int(params.get("voice_id", 1))
     emotion_id = int(params.get("emotion_id", 0))
     speed = float(params.get("speed", 1.0))
+    video_title = params.get("video_title", "Untitled Video")
+    audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed, video_title)
     if metadata["success"]:
         return {
             "status": "success",
+            "video_title": metadata.get("video_title"),
+            "video_folder": metadata.get("video_folder"),
+            "audio_url": metadata.get("audio_url"),
             "dataset_path": metadata.get("dataset_path"),
             "filename": metadata.get("filename"),
             "metadata": metadata.get("metadata"),
+            "message": metadata.get("message")
         }
     else:
         return {