Spaces:

yukee1992
/

Tts-api-new

Paused

App Files Files Community

yukee1992 committed on Mar 6

Commit

3640d59

verified ·

1 Parent(s): 480a42a

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -197

app.py CHANGED Viewed

@@ -10,10 +10,19 @@ import uuid
 from datetime import datetime
 import shutil
 import re
 # Configuration
-HF_TOKEN = os.environ.get("HF_TOKEN")  # Set in Space secrets
-DATASET_REPO = os.environ.get("DATASET_REPO", "YOUR_USERNAME/video-media-dataset")  # Your dataset name
 # Initialize Hugging Face API
 hf_api = HfApi(token=HF_TOKEN)
@@ -35,9 +44,20 @@ VOICE_DESCRIPTIONS = {
     4: "Professional (Yunxi) - Clear, broadcast"
 }
 def sanitize_folder_name(title):
     """Convert video title to safe folder name"""
-    # Remove special characters and replace spaces with underscores
     safe_name = re.sub(r'[^\w\s-]', '', title)
     safe_name = re.sub(r'[-\s]+', '_', safe_name)
     return safe_name.strip('_')
@@ -53,34 +73,22 @@ def get_emotion_params(emotion_id):
     }
     return emotions.get(emotion_id, emotions[0])
-def upload_to_dataset(audio_path, metadata, video_title):
     """
     Upload audio file to Hugging Face dataset under video title folder
-    Args:
-        audio_path: Local path to audio file
-        metadata: Dictionary with generation metadata
-        video_title: Title of the video (used as folder name)
-    Returns:
-        dict: Upload result with file URL
     """
     try:
-        # Create safe folder name from video title
-        folder_name = sanitize_folder_name(video_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_id = str(uuid.uuid4())[:8]
-        # Get voice and emotion info
         voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
         emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
         emotion_name = emotion_names[metadata["emotion_id"]]
-        # Create filename: [timestamp]_[voice]_[emotion]_[fileid].mp3
         filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
-        # Path in dataset: /[video_title]/audio/[filename]
-        dataset_path = f"{folder_name}/audio/{filename}"
         # Upload audio file to dataset
         upload_file(
@@ -91,10 +99,8 @@ def upload_to_dataset(audio_path, metadata, video_title):
             token=HF_TOKEN
         )
-        # Generate the raw file URL
         file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
-        # Create metadata entry
         metadata_entry = {
             "file_id": file_id,
             "type": "audio",
@@ -102,7 +108,7 @@ def upload_to_dataset(audio_path, metadata, video_title):
             "dataset_path": dataset_path,
             "file_url": file_url,
             "video_title": video_title,
-            "video_folder": folder_name,
             "timestamp": timestamp,
             "text": metadata["text"],
             "voice_id": metadata["voice_id"],
@@ -113,26 +119,12 @@ def upload_to_dataset(audio_path, metadata, video_title):
             "parameters": metadata["parameters"]
         }
-        # Update or create video metadata file (stores all assets for this video)
-        video_metadata_path = f"{folder_name}/metadata.json"
-        # Try to download existing metadata if it exists
-        existing_metadata = []
-        try:
-            # This is a simplified approach - in production you'd want to properly manage metadata
-            pass
-        except:
-            existing_metadata = []
-        # For now, we'll create a separate metadata file for each audio
-        # You can enhance this to maintain a single metadata file per video
-        audio_metadata_path = f"{folder_name}/metadata/audio_{file_id}.json"
         with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
             json.dump(metadata_entry, f, indent=2)
             temp_meta_path = f.name
-        # Upload audio metadata
         upload_file(
             path_or_fileobj=temp_meta_path,
             path_in_repo=audio_metadata_path,
@@ -141,7 +133,6 @@ def upload_to_dataset(audio_path, metadata, video_title):
             token=HF_TOKEN
         )
-        # Cleanup temp files
         os.unlink(temp_meta_path)
         return {
@@ -149,7 +140,7 @@ def upload_to_dataset(audio_path, metadata, video_title):
             "file_url": file_url,
             "dataset_path": dataset_path,
             "filename": filename,
-            "video_folder": folder_name,
             "metadata": metadata_entry
         }
@@ -159,26 +150,18 @@ def upload_to_dataset(audio_path, metadata, video_title):
             "error": str(e)
         }
-async def generate_speech(text, voice_id, emotion_id, speed, video_title):
     """
-    Generate speech and save to dataset under video title folder
-    Returns:
-        tuple: (local_audio_path, response_data)
     """
     try:
-        # Get voice
         voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")
-        # Get emotion parameters
         emotion_params = get_emotion_params(emotion_id)
-        # Adjust rate based on speed
         rate_percentage = int(emotion_params["rate"].replace("%", "").replace("+", ""))
         adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
         rate = f"{adjusted_rate:+d}%"
-        # Create communicate object
         communicate = edge_tts.Communicate(
             text,
             voice,
@@ -187,13 +170,11 @@ async def generate_speech(text, voice_id, emotion_id, speed, video_title):
             volume=emotion_params["volume"]
         )
-        # Generate audio to temporary file
         temp_dir = tempfile.mkdtemp()
         local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
         await communicate.save(local_audio_path)
-        # Prepare metadata for dataset
         metadata = {
             "text": text,
             "voice_id": voice_id,
@@ -207,187 +188,112 @@ async def generate_speech(text, voice_id, emotion_id, speed, video_title):
             }
         }
-        # Upload to dataset under video title folder
-        upload_result = upload_to_dataset(local_audio_path, metadata, video_title)
-        # Cleanup temp directory
         shutil.rmtree(temp_dir)
         if upload_result["success"]:
-            return local_audio_path, {
                 "success": True,
-                "message": f"Audio generated and saved to dataset under folder: {video_title}",
                 "video_title": video_title,
-                "video_folder": upload_result["video_folder"],
                 "audio_url": upload_result["file_url"],
                 "dataset_path": upload_result["dataset_path"],
                 "filename": upload_result["filename"],
-                "metadata": upload_result["metadata"],
-                "local_audio_available": True
             }
         else:
-            return local_audio_path, {
-                "success": True,
-                "message": "Audio generated but failed to save to dataset",
-                "warning": upload_result["error"],
-                "audio_url": None,
-                "local_audio_available": True
             }
     except Exception as e:
-        return None, {
             "success": False,
             "error": str(e)
         }
-def tts_wrapper(text, voice_id, emotion_id, speed, video_title):
-    """Wrapper function to handle async"""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    audio_path, metadata = loop.run_until_complete(
-        generate_speech(text, voice_id, emotion_id, speed, video_title)
-    )
-    return audio_path, metadata
-# Create Gradio interface
-with gr.Blocks(title="TTS with Dataset Storage by Video Title", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🎙️ TTS API with Hugging Face Dataset Storage
-    ### Audio files organized by video title folders
-    ## 📁 Dataset Structure
-    ```
-    your-dataset/
-    ├── [Video_Title_1]/
-    │   ├── audio/
-    │   │   ├── 20240115_143022_Xiaoyi_happy_a1b2.mp3
-    │   │   └── 20240115_143145_Xiaoxiao_neutral_e5f6.mp3
-    │   └── metadata/
-    │       ├── audio_a1b2.json
-    │       └── audio_e5f6.json
-    ├── [Video_Title_2]/
-    │   ├── audio/
-    │   │   └── 20240115_144512_Yunjian_excited_g7h8.mp3
-    │   └── metadata/
-    │       └── audio_g7h8.json
-    └── images/  (for future image storage)
-        └── [Video_Title]/
-            └── thumbnail.jpg
-    ```
-    """)
     with gr.Row():
         with gr.Column(scale=1):
             video_title_input = gr.Textbox(
-                label="🎬 Video Title (used as folder name)",
                 placeholder="Enter video title...",
-                value="My Awesome Video",
-                info="This will create a folder with this name in the dataset"
             )
             text_input = gr.Textbox(
                 label="📝 Text to synthesize",
                 placeholder="输入中文或English...",
                 lines=3,
                 value="你好，欢迎使用语音合成服务。"
             )
-            with gr.Row():
-                voice_slider = gr.Slider(
-                    minimum=0, maximum=4, step=1, value=1,
-                    label="Voice ID (0-4)"
-                )
-                voice_preview = gr.Markdown("**Selected:** Sweet Voice (Xiaoyi)")
-            with gr.Row():
-                emotion_slider = gr.Slider(
-                    minimum=0, maximum=4, step=1, value=0,
-                    label="Emotion ID (0-4)"
-                )
-                emotion_preview = gr.Markdown("**Selected:** Neutral")
-            speed_slider = gr.Slider(
-                minimum=0.5, maximum=2.0, step=0.1, value=1.0,
-                label="Speed"
-            )
-            generate_btn = gr.Button("🎵 Generate & Save to Video Folder", variant="primary", size="lg")
         with gr.Column(scale=1):
-            audio_output = gr.Audio(
-                label="Generated Audio",
-                type="filepath"
-            )
-            json_output = gr.JSON(
-                label="Response Data (includes dataset URL)"
-            )
-            # Show dataset structure preview
-            gr.Markdown(f"""
-            ### 📊 Dataset Info
-            - **Dataset:** `{DATASET_REPO}`
-            - **Structure:** `/[Video Title]/audio/[file].mp3`
-            - **Metadata:** `/[Video Title]/metadata/[file_id].json`
-            """)
-    # Update previews
-    def update_voice_preview(voice_id):
-        return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
-    def update_emotion_preview(emotion_id):
-        emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
-        return f"**Selected:** {emotions[emotion_id]}"
-    voice_slider.change(
-        fn=update_voice_preview,
-        inputs=voice_slider,
-        outputs=voice_preview
-    )
-    emotion_slider.change(
-        fn=update_emotion_preview,
-        inputs=emotion_slider,
-        outputs=emotion_preview
-    )
-    # Generate button click
     generate_btn.click(
-        fn=tts_wrapper,
-        inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input],
         outputs=[audio_output, json_output]
     )
-# API endpoint for n8n
-async def api_generate(params):
-    """API endpoint for n8n - returns permanent dataset URL"""
-    text = params.get("text", "")
-    voice_id = int(params.get("voice_id", 1))
-    emotion_id = int(params.get("emotion_id", 0))
-    speed = float(params.get("speed", 1.0))
-    video_title = params.get("video_title", "Untitled Video")
-    audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed, video_title)
-    if metadata["success"]:
-        return {
-            "status": "success",
-            "video_title": metadata.get("video_title"),
-            "video_folder": metadata.get("video_folder"),
-            "audio_url": metadata.get("audio_url"),
-            "dataset_path": metadata.get("dataset_path"),
-            "filename": metadata.get("filename"),
-            "metadata": metadata.get("metadata"),
-            "message": metadata.get("message")
-        }
-    else:
-        return {
-            "status": "error",
-            "error": metadata["error"]
-        }
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True
-    )

 from datetime import datetime
 import shutil
 import re
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
 # Configuration
+HF_TOKEN = os.environ.get("HF_TOKEN")
+DATASET_REPO = os.environ.get("DATASET_REPO", "yukee1992/video-project-images")  # Use same dataset as images
+print("=" * 60)
+print("🚀 STARTING TTS SERVICE WITH API")
+print("=" * 60)
+print(f"📦 HF Dataset: {DATASET_REPO}")
+print(f"🔑 HF Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
 # Initialize Hugging Face API
 hf_api = HfApi(token=HF_TOKEN)
     4: "Professional (Yunxi) - Clear, broadcast"
 }
+# Create FastAPI app
+fastapi_app = FastAPI(title="TTS API")
+# Add CORS middleware
+fastapi_app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 def sanitize_folder_name(title):
     """Convert video title to safe folder name.
     Drops every character that is not a word character, whitespace or a
     hyphen, collapses runs of hyphens/whitespace into one underscore and
     trims leading/trailing underscores.
     Args:
         title: Video title; ``None`` is treated like an empty title.
     Returns:
         str: The sanitized name, or ``"Untitled_Video"`` when nothing usable
         is left (empty title, or a title made only of stripped characters),
         so the result never produces an empty path segment.
     """
     cleaned = re.sub(r'[^\w\s-]', '', '' if title is None else str(title))
     cleaned = re.sub(r'[-\s]+', '_', cleaned).strip('_')
     # An empty name would yield a path such as "data/projects//audio/...".
     return cleaned or "Untitled_Video"
     }
     return emotions.get(emotion_id, emotions[0])
+def upload_to_dataset(audio_path, metadata, video_title, project_id=None):
     """
     Upload audio file to Hugging Face dataset under video title folder
     """
     try:
+        # Use project_id if provided, otherwise use video_title
+        folder_name = project_id if project_id else sanitize_folder_name(video_title)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_id = str(uuid.uuid4())[:8]
         voice_name = VOICE_DESCRIPTIONS[metadata["voice_id"]].split(" ")[0]
         emotion_names = ["neutral", "happy", "sad", "excited", "frustrated"]
         emotion_name = emotion_names[metadata["emotion_id"]]
         filename = f"{timestamp}_{voice_name}_{emotion_name}_{file_id}.mp3"
+        dataset_path = f"data/projects/{folder_name}/audio/{filename}"
         # Upload audio file to dataset
         upload_file(
             token=HF_TOKEN
         )
         file_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{dataset_path}"
         metadata_entry = {
             "file_id": file_id,
             "type": "audio",
             "dataset_path": dataset_path,
             "file_url": file_url,
             "video_title": video_title,
+            "project_id": folder_name,
             "timestamp": timestamp,
             "text": metadata["text"],
             "voice_id": metadata["voice_id"],
             "parameters": metadata["parameters"]
         }
+        # Upload metadata
+        audio_metadata_path = f"data/projects/{folder_name}/metadata/audio_{file_id}.json"
         with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
             json.dump(metadata_entry, f, indent=2)
             temp_meta_path = f.name
         upload_file(
             path_or_fileobj=temp_meta_path,
             path_in_repo=audio_metadata_path,
             token=HF_TOKEN
         )
         os.unlink(temp_meta_path)
         return {
             "file_url": file_url,
             "dataset_path": dataset_path,
             "filename": filename,
+            "project_id": folder_name,
             "metadata": metadata_entry
         }
             "error": str(e)
         }
+async def generate_speech(text, voice_id, emotion_id, speed, video_title, project_id=None):
     """
+    Generate speech and save to dataset
     """
     try:
         voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")
         emotion_params = get_emotion_params(emotion_id)
         rate_percentage = int(emotion_params["rate"].replace("%", "").replace("+", ""))
         adjusted_rate = rate_percentage + int((speed - 1.0) * 50)
         rate = f"{adjusted_rate:+d}%"
         communicate = edge_tts.Communicate(
             text,
             voice,
             volume=emotion_params["volume"]
         )
         temp_dir = tempfile.mkdtemp()
         local_audio_path = os.path.join(temp_dir, "temp_audio.mp3")
         await communicate.save(local_audio_path)
         metadata = {
             "text": text,
             "voice_id": voice_id,
             }
         }
+        upload_result = upload_to_dataset(local_audio_path, metadata, video_title, project_id)
         shutil.rmtree(temp_dir)
         if upload_result["success"]:
+            return {
                 "success": True,
+                "message": f"Audio generated and saved to dataset",
                 "video_title": video_title,
+                "project_id": upload_result["project_id"],
                 "audio_url": upload_result["file_url"],
                 "dataset_path": upload_result["dataset_path"],
                 "filename": upload_result["filename"],
+                "metadata": upload_result["metadata"]
             }
         else:
+            return {
+                "success": False,
+                "error": upload_result["error"]
             }
     except Exception as e:
+        return {
             "success": False,
             "error": str(e)
         }
+# =============================================
+# FASTAPI ENDPOINTS FOR n8n
+# =============================================
+@fastapi_app.get("/")
+async def root():
+    return {
+        "name": "TTS API",
+        "endpoints": {
+            "generate": "POST /api/generate",
+            "health": "GET /api/health"
+        }
+    }
+@fastapi_app.get("/api/health")
+async def health():
+    return {"status": "healthy", "service": "tts"}
+@fastapi_app.post("/api/generate")
+async def generate_tts(request: dict):
+    """API endpoint for n8n - returns permanent dataset URL"""
+    try:
+        text = request.get("text", "")
+        voice_id = int(request.get("voice_id", 1))
+        emotion_id = int(request.get("emotion_id", 0))
+        speed = float(request.get("speed", 1.0))
+        video_title = request.get("video_title", "Untitled Video")
+        project_id = request.get("project_id")  # Optional project ID from n8n
+        if not text:
+            return {"status": "error", "error": "No text provided"}
+        result = await generate_speech(text, voice_id, emotion_id, speed, video_title, project_id)
+        return result
+    except Exception as e:
+        return {"status": "error", "error": str(e)}
+# =============================================
+# GRADIO INTERFACE
+# =============================================
+with gr.Blocks(title="TTS with Dataset Storage") as demo:
+    gr.Markdown("# 🎙️ TTS API with Hugging Face Dataset Storage")
     with gr.Row():
         with gr.Column(scale=1):
             video_title_input = gr.Textbox(
+                label="🎬 Video Title",
                 placeholder="Enter video title...",
+                value="My Video"
+            )
+            project_id_input = gr.Textbox(
+                label="📁 Project ID (optional)",
+                placeholder="Enter project ID if known..."
             )
             text_input = gr.Textbox(
                 label="📝 Text to synthesize",
                 placeholder="输入中文或English...",
                 lines=3,
                 value="你好，欢迎使用语音合成服务。"
             )
+            voice_slider = gr.Slider(minimum=0, maximum=4, step=1, value=1, label="Voice ID")
+            emotion_slider = gr.Slider(minimum=0, maximum=4, step=1, value=0, label="Emotion ID")
+            speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Speed")
+            generate_btn = gr.Button("🎵 Generate", variant="primary")
         with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Generated Audio", type="filepath")
+            json_output = gr.JSON(label="Response Data")
     generate_btn.click(
+        fn=lambda t, v, e, s, vt, p: asyncio.run(generate_speech(t, v, e, s, vt, p)),
+        inputs=[text_input, voice_slider, emotion_slider, speed_slider, video_title_input, project_id_input],
         outputs=[audio_output, json_output]
     )
+# =============================================
+# MAIN - Mount Gradio to FastAPI
+# =============================================
+app = gr.mount_gradio_app(fastapi_app, demo, path="/")
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)