Spaces:

yukee1992
/

Tts-api-new

Paused

App Files Files Community

yukee1992 committed on Mar 5

Commit

f388252

verified ·

1 Parent(s): 477e22f

Update app.py

Browse files

Files changed (1) hide show

app.py +166 -59

app.py CHANGED Viewed

@@ -5,17 +5,27 @@ import tempfile
 import os
 import json
 from pathlib import Path
-# Chinese voice options with different characteristics
 VOICE_MAPPING = {
-    0: "zh-CN-XiaoxiaoNeural",    # Loyal Sister - Gentle, warm
-    1: "zh-CN-XiaoyiNeural",      # Sweet Voice - Lively, cute
-    2: "zh-CN-YunjianNeural",     # Cool Voice - Deep, calm
-    3: "zh-CN-XiaomengNeural",    # Loli Voice - Childish, energetic
-    4: "zh-CN-YunxiNeural",       # Professional - Clear, broadcast
 }
-# Voice style descriptions
 VOICE_DESCRIPTIONS = {
     0: "Loyal Sister (Xiaoxiao) - Warm, caring",
     1: "Sweet Voice (Xiaoyi) - Lively, cute",
@@ -24,7 +34,6 @@ VOICE_DESCRIPTIONS = {
     4: "Professional (Yunxi) - Clear, broadcast"
 }
-# Emotion mapping through speech rate and pitch
 def get_emotion_params(emotion_id):
     """Convert emotion ID to speech parameters"""
     emotions = {
@@ -36,15 +45,99 @@ def get_emotion_params(emotion_id):
     }
     return emotions.get(emotion_id, emotions[0])
-async def generate_speech(text, voice_id, emotion_id, speed=1.0):
     """
-    Generate speech using Edge TTS
     Args:
-        text: Text to synthesize (Chinese or English)
-        voice_id: 0-4 for different voice types
-        emotion_id: 0-4 for different emotions
-        speed: Speech rate multiplier
     """
     try:
         # Get voice
@@ -58,7 +151,7 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
         adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
         rate = f"{adjusted_rate:+d}%"
-        # Create communicate object with parameters
         communicate = edge_tts.Communicate(
             text,
             voice,
@@ -69,15 +162,15 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
         # Generate audio to temporary file
         temp_dir = tempfile.mkdtemp()
-        output_path = os.path.join(temp_dir, "output.mp3")
-        await communicate.save(output_path)
-        # Return audio file path and metadata
-        return output_path, {
-            "success": True,
-            "voice": VOICE_DESCRIPTIONS[voice_id],
             "voice_id": voice_id,
             "emotion_id": emotion_id,
             "speed": speed,
             "parameters": {
@@ -87,6 +180,33 @@ async def generate_speech(text, voice_id, emotion_id, speed=1.0):
             }
         }
     except Exception as e:
         return None, {
             "success": False,
@@ -103,16 +223,15 @@ def tts_wrapper(text, voice_id, emotion_id, speed):
     return audio_path, metadata
 # Create Gradio interface
-with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎙️ Chinese TTS API for n8n
-    ### Stable Edge TTS backend with voice and emotion control
-    | Parameter | Range | Description |
-    |-----------|-------|-------------|
-    | Voice ID | 0-4 | Different voice characteristics |
-    | Emotion ID | 0-4 | Emotional expression |
-    | Speed | 0.5-2.0 | Speech rate |
     """)
     with gr.Row():
@@ -143,46 +262,31 @@ with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
                 label="Speed"
             )
-            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
-                label="Generated Audio",
                 type="filepath"
             )
             json_output = gr.JSON(
-                label="Response Data (for n8n)"
             )
-            # Voice reference table
-            gr.Markdown("""
-            ### Voice Reference
-            | ID | Voice | Description |
-            |----|-------|-------------|
-            | 0 | Xiaoxiao | Loyal Sister - Warm, caring |
-            | 1 | Xiaoyi | Sweet Voice - Lively, cute |
-            | 2 | Yunjian | Cool Voice - Deep, calm |
-            | 3 | Xiaomeng | Loli Voice - Childish |
-            | 4 | Yunxi | Professional - Clear |
-            ### Emotion Reference
-            | ID | Emotion | Effect |
-            |----|---------|--------|
-            | 0 | Neutral | Normal speech |
-            | 1 | Happy | Higher pitch, faster |
-            | 2 | Sad | Lower pitch, slower |
-            | 3 | Excited | High energy, fast |
-            | 4 | Frustrated | Tense, emphasized |
             """)
-    # Update previews when sliders change
     def update_voice_preview(voice_id):
         return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
     def update_emotion_preview(emotion_id):
-        emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
         return f"**Selected:** {emotions[emotion_id]}"
     voice_slider.change(
@@ -204,9 +308,9 @@ with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
         outputs=[audio_output, json_output]
     )
-# For API mode (used by n8n)
 async def api_generate(params):
-    """API endpoint for n8n"""
     text = params.get("text", "")
     voice_id = int(params.get("voice_id", 1))
     emotion_id = int(params.get("emotion_id", 0))
@@ -217,8 +321,11 @@ async def api_generate(params):
     if metadata["success"]:
         return {
             "status": "success",
-            "audio_url": f"/file={audio_path}",
-            "metadata": metadata
         }
     else:
         return {

 import os
 import json
 from pathlib import Path
+from huggingface_hub import HfApi, upload_file
+import uuid
+from datetime import datetime
+import shutil
+# Configuration
+HF_TOKEN = os.environ.get("HF_TOKEN")  # You'll set this in Space secrets
+DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/tts-audio-dataset")  # Your dataset name
+# Initialize Hugging Face API
+hf_api = HfApi(token=HF_TOKEN)
+# Chinese voice options
 VOICE_MAPPING = {
+    0: "zh-CN-XiaoxiaoNeural",    # Loyal Sister
+    1: "zh-CN-XiaoyiNeural",      # Sweet Voice
+    2: "zh-CN-YunjianNeural",     # Cool Voice
+    3: "zh-CN-XiaomengNeural",    # Loli Voice
+    4: "zh-CN-YunxiNeural",       # Professional
 }
 VOICE_DESCRIPTIONS = {
     0: "Loyal Sister (Xiaoxiao) - Warm, caring",
     1: "Sweet Voice (Xiaoyi) - Lively, cute",
     4: "Professional (Yunxi) - Clear, broadcast"
 }
 def get_emotion_params(emotion_id):
     """Convert emotion ID to speech parameters"""
     emotions = {
     }
     return emotions.get(emotion_id, emotions[0])
def upload_to_dataset(audio_path, metadata):
    """
    Upload audio file to Hugging Face dataset and return URL

    The audio is stored under ``audio/YYYY/MM/DD/`` and a JSON record
    describing the generation under ``metadata/YYYY/MM/DD/`` in DATASET_REPO.

    Args:
        audio_path: Local path to audio file
        metadata: Dictionary with generation metadata; the keys "text",
            "voice_id", "emotion_id", "speed" and "parameters" are read.
    Returns:
        dict: Upload result. On success: "success" (True), "file_url",
            "dataset_path", "filename" and "metadata". On any failure:
            "success" (False) and "error" (the exception message).
    """
    temp_meta_path = None
    try:
        # Take one timestamp so the filename and the date folder cannot
        # disagree when a call straddles midnight.
        now = datetime.now()

        # Generate unique filename
        file_id = str(uuid.uuid4())[:8]
        timestamp = now.strftime("%Y%m%d_%H%M%S")

        # Create filename with metadata; the voice name is the first word of
        # the voice description.
        voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
        emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
        emotion_name = emotion_names[metadata["emotion_id"]]
        filename = f"tts_{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"

        # Path in dataset (organize by date)
        date_path = now.strftime("%Y/%m/%d")
        dataset_path = f"audio/{date_path}/{filename}"

        # Upload file to dataset
        upload_file(
            path_or_fileobj=audio_path,
            path_in_repo=dataset_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )

        # Generate the raw file URL
        file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"

        # Metadata record stored next to the audio (one JSON file per generation)
        metadata_entry = {
            "file_id": file_id,
            "filename": filename,
            "dataset_path": dataset_path,
            "file_url": file_url,
            "timestamp": timestamp,
            "text": metadata["text"],
            "voice_id": metadata["voice_id"],
            "voice_name": voice_name,
            "emotion_id": metadata["emotion_id"],
            "emotion_name": emotion_name,
            "speed": metadata["speed"],
            "parameters": metadata["parameters"]
        }

        metadata_filename = f"metadata/{date_path}/{file_id}.json"
        # delete=False: the file must outlive the `with` so upload_file can
        # read it by path; it is removed in the `finally` clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            temp_meta_path = f.name
            json.dump(metadata_entry, f, indent=2)

        # Upload metadata
        upload_file(
            path_or_fileobj=temp_meta_path,
            path_in_repo=metadata_filename,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )

        return {
            "success": True,
            "file_url": file_url,
            "dataset_path": dataset_path,
            "filename": filename,
            "metadata": metadata_entry
        }
    except Exception as e:
        # Boundary handler: callers expect a result dict, never an exception.
        return {
            "success": False,
            "error": str(e)
        }
    finally:
        # Cleanup temp file on success AND failure so a failed metadata
        # upload does not leak files in the temp directory.
        if temp_meta_path is not None:
            try:
                os.unlink(temp_meta_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not turn a
                # successful upload into a failure.
                pass
+async def generate_speech(text, voice_id, emotion_id, speed=1.0):
+    """
+    Generate speech and save to dataset
+    Returns:
+        tuple: (local_audio_path, response_data)
     """
     try:
         # Get voice
         adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
         rate = f"{adjusted_rate:+d}%"
+        # Create communicate object
         communicate = edge_tts.Communicate(
             text,
             voice,
         # Generate audio to temporary file
         temp_dir = tempfile.mkdtemp()
+        local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
+        await communicate.save(local_audio_path)
+        # Prepare metadata for dataset
+        metadata = {
+            "text": text,
             "voice_id": voice_id,
+            "voice_description": VOICE_DESCRIPTIONS[voice_id],
             "emotion_id": emotion_id,
             "speed": speed,
             "parameters": {
             }
         }
+        # Upload to dataset
+        upload_result = upload_to_dataset(local_audio_path, metadata)
+        # Cleanup temp directory
+        shutil.rmtree(temp_dir)
+        if upload_result["success"]:
+            # Return both local file (for immediate playback) and dataset URL
+            return local_audio_path, {
+                "success": True,
+                "message": "Audio generated and saved to dataset",
+                "audio_url": upload_result["file_url"],  # Permanent URL for n8n
+                "dataset_path": upload_result["dataset_path"],
+                "filename": upload_result["filename"],
+                "metadata": upload_result["metadata"],
+                "local_audio_available": True  # For web interface playback
+            }
+        else:
+            # If upload fails, still return local audio but with warning
+            return local_audio_path, {
+                "success": True,
+                "message": "Audio generated but failed to save to dataset",
+                "warning": upload_result["error"],
+                "audio_url": None,
+                "local_audio_available": True
+            }
     except Exception as e:
         return None, {
             "success": False,
     return audio_path, metadata
 # Create Gradio interface
+with gr.Blocks(title="Chinese TTS API with Dataset Storage", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🎙️ Chinese TTS API with Hugging Face Dataset Storage
+    ### Generate speech and automatically save to dataset with permanent URL
+    ## 🔗 Dataset Integration
+    - Audio files are automatically saved to your Hugging Face dataset
+    - Returns permanent URL for use in n8n workflows
+    - Files organized by date in the dataset
     """)
     with gr.Row():
                 label="Speed"
             )
+            generate_btn = gr.Button("🎵 Generate & Save to Dataset", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
+                label="Generated Audio (Local)",
                 type="filepath"
             )
             json_output = gr.JSON(
+                label="Response Data (includes permanent dataset URL)"
             )
+            # Show dataset info
+            gr.Markdown(f"""
+            ### 📊 Dataset Info
+            - **Dataset:** `{DATASET_REPO}`
+            - Audio files saved to: `/audio/YYYY/MM/DD/`
+            - Metadata saved to: `/metadata/YYYY/MM/DD/`
             """)
+    # Update previews
     def update_voice_preview(voice_id):
         """Return the Markdown preview line for the selected voice ID."""
         description = VOICE_DESCRIPTIONS[voice_id]
         return f"**Selected:** {description}"
     def update_emotion_preview(emotion_id):
         """Return the Markdown preview line for the selected emotion ID.

         The list position is the emotion ID, so IDs 0-4 are valid indexes.
         """
         # "Excited" was misspelled "Exicted" in this user-facing label.
         emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
         return f"**Selected:** {emotions[emotion_id]}"
     voice_slider.change(
         outputs=[audio_output, json_output]
     )
+# API endpoint for n8n
 async def api_generate(params):
+    """API endpoint for n8n - returns permanent dataset URL"""
     text = params.get("text", "")
     voice_id = int(params.get("voice_id", 1))
     emotion_id = int(params.get("emotion_id", 0))
     if metadata["success"]:
         return {
             "status": "success",
+            "audio_url": metadata.get("audio_url"),  # Permanent dataset URL
+            "dataset_path": metadata.get("dataset_path"),
+            "filename": metadata.get("filename"),
+            "metadata": metadata.get("metadata"),
+            "message": metadata.get("message", "Audio generated successfully")
         }
     else:
         return {