Spaces:

yukee1992
/

Tts-api-new

Paused

App Files Files Community

yukee1992 committed on Mar 5

Commit

be33088

verified ·

1 Parent(s): 77fe1f0

Update app.py

Browse files

Files changed (1) hide show

app.py +198 -72

app.py CHANGED Viewed

@@ -1,108 +1,234 @@
 import gradio as gr
-import ChatTTS
-import torch
-import torchaudio
-import numpy as np
 import os
 from pathlib import Path
-# Initialize model
-chat = ChatTTS.Chat()
-chat.load(compile=False)  # Set to True for better performance
-# Voice profile mapping (0-4)
-VOICE_PROFILES = {
-    0: "loyal_sister",    # Warm, caring tone
-    1: "sweet_voice",     # Gentle, melodic
-    2: "cool_voice",      # Calm, composed
-    3: "loli_voice",      # High-pitched, youthful
-    4: "professional",    # Neutral, clear
 }
-# Emotion control mapping [citation:2]
-EMOTION_CONTROLS = {
-    0: "[oral_0][laugh_0][break_0]",  # Neutral
-    1: "[oral_6][laugh_2][break_4]",  # Happy
-    2: "[oral_2][laugh_0][break_6]",  # Sad
-    3: "[oral_8][laugh_1][break_2]",  # Excited
-    4: "[oral_7][laugh_0][break_5]",  # Frustrated
 }
-def generate_speaker(voice_id):
-    """Generate consistent speaker embedding"""
-    seed_map = {0: 42, 1: 123, 2: 256, 3: 389, 4: 512}
-    torch.manual_seed(seed_map.get(voice_id, 42))
-    return chat.sample_random_speaker()
-def tts_generate(text, voice_id, emotion_id, speed=1.0):
-    """Generate speech with controlled voice and emotion"""
     try:
-        # Get speaker embedding
-        spk_emb = generate_speaker(voice_id)
-        # Configure inference parameters
-        params_infer_code = ChatTTS.Chat.InferCodeParams(
-            spk_emb=spk_emb,
-            temperature=0.3 * speed,
-            top_P=0.7,
-            top_K=20,
-        )
-        # Configure emotion
-        params_refine_text = ChatTTS.Chat.RefineTextParams(
-            prompt=EMOTION_CONTROLS.get(emotion_id, "[oral_0][laugh_0][break_0]"),
         )
-        # Generate speech
-        wavs = chat.infer([text],
-                         params_refine_text=params_refine_text,
-                         params_infer_code=params_infer_code)
-        # Save audio
-        audio_tensor = torch.from_numpy(wavs[0]).unsqueeze(0)
-        output_path = "output.wav"
-        torchaudio.save(output_path, audio_tensor, 24000)
         return output_path, {
             "success": True,
-            "voice": VOICE_PROFILES[voice_id],
-            "emotion": emotion_id,
-            "speed": speed
         }
     except Exception as e:
-        return None, {"success": False, "error": str(e)}
 # Create Gradio interface
-with gr.Blocks(title="Chinese TTS API", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎙️ Chinese TTS API for n8n
-    Control voice and emotion via numeric parameters
     """)
     with gr.Row():
-        with gr.Column():
             text_input = gr.Textbox(
-                label="Text (支持中文)",
-                placeholder="输入中文文本...",
-                lines=3
             )
-            voice_id = gr.Slider(0, 4, step=1, value=1,
-                label="Voice ID (0: Sister, 1: Sweet, 2: Cool, 3: Loli, 4: Professional)")
-            emotion_id = gr.Slider(0, 4, step=1, value=0,
-                label="Emotion ID (0: Neutral, 1: Happy, 2: Sad, 3: Excited, 4: Frustrated)")
-            speed = gr.Slider(0.5, 2.0, step=0.1, value=1.0,
-                label="Speed")
-            generate_btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_output = gr.Audio(label="Generated Audio", type="filepath")
-            json_output = gr.JSON(label="Response")
     generate_btn.click(
-        fn=tts_generate,
-        inputs=[text_input, voice_id, emotion_id, speed],
         outputs=[audio_output, json_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+import asyncio
+import edge_tts
+import tempfile
 import os
+import json
 from pathlib import Path
+# Chinese voice options with different characteristics
+# Maps the numeric voice ID (0-4) to the voice name that is handed to
+# edge_tts.Communicate; the trailing comments give each persona's label.
+# NOTE(review): confirm every name is actually offered by the Edge TTS
+# service (e.g. `edge-tts --list-voices`) - not verifiable from this file.
+VOICE_MAPPING = {
+    0: "zh-CN-XiaoxiaoNeural",    # Loyal Sister - Gentle, warm
+    1: "zh-CN-XiaoyiNeural",      # Sweet Voice - Lively, cute
+    2: "zh-CN-YunjianNeural",     # Cool Voice - Deep, calm
+    3: "zh-CN-XiaomengNeural",    # Loli Voice - Childish, energetic
+    4: "zh-CN-YunxiNeural",       # Professional - Clear, broadcast
 }
+# Voice style descriptions
+# Human-readable label for each voice ID; the keys must stay in step with
+# VOICE_MAPPING. The labels are shown in the slider preview and returned in
+# the "voice" field of the response metadata.
+VOICE_DESCRIPTIONS = {
+    0: "Loyal Sister (Xiaoxiao) - Warm, caring",
+    1: "Sweet Voice (Xiaoyi) - Lively, cute",
+    2: "Cool Voice (Yunjian) - Deep, calm",
+    3: "Loli Voice (Xiaomeng) - Childish, energetic",
+    4: "Professional (Yunxi) - Clear, broadcast"
 }
+# Emotion mapping through speech rate and pitch
+def get_emotion_params(emotion_id):
+    """Look up the prosody settings (rate, pitch, volume) for an emotion ID.
+    IDs 0-4 select Neutral, Happy, Sad, Excited and Frustrated; any other
+    value falls back to the Neutral settings.
+    """
+    neutral = dict(rate="+0%", pitch="+0Hz", volume="+0%")
+    presets = {
+        0: neutral,
+        1: dict(rate="+15%", pitch="+30Hz", volume="+10%"),  # Happy
+        2: dict(rate="-10%", pitch="-20Hz", volume="-10%"),  # Sad
+        3: dict(rate="+25%", pitch="+50Hz", volume="+15%"),  # Excited
+        4: dict(rate="+5%", pitch="+15Hz", volume="+5%"),    # Frustrated
+    }
+    if emotion_id in presets:
+        return presets[emotion_id]
+    return neutral
+async def generate_speech(text, voice_id, emotion_id, speed=1.0):
+    """
+    Generate speech using Edge TTS and save it as an MP3 file
+    Args:
+        text: Text to synthesize (Chinese or English); blank text is rejected
+        voice_id: 0-4 for different voice types; unknown IDs fall back to
+            voice 0 (Xiaoxiao) for both the audio and the reported label
+        emotion_id: 0-4 for different emotions; unknown IDs fall back to neutral
+        speed: Speech rate multiplier; round((speed - 1.0) * 50) percentage
+            points are added to the emotion's base rate
+    Returns:
+        Tuple (output_path, metadata). On success output_path is the path of
+        the generated MP3 and metadata has "success": True plus the settings
+        used. On failure output_path is None and metadata is
+        {"success": False, "error": <message>}; exceptions raised during
+        synthesis are caught and reported this way rather than propagated.
+    """
+    import shutil  # local import: only used to clean up after a failure
+    temp_dir = None
     try:
+        # Reject blank input before opening a connection to the TTS service
+        if not text or not text.strip():
+            return None, {
+                "success": False,
+                "error": "Text must not be empty"
+            }
+        # Get voice
+        voice = VOICE_MAPPING.get(voice_id, "zh-CN-XiaoxiaoNeural")
+        # Get emotion parameters
+        emotion_params = get_emotion_params(emotion_id)
+        # Adjust rate based on speed
+        rate_percentage = int(emotion_params["rate"].rstrip("%"))
+        # round(), not int(): (0.9 - 1.0) * 50 is -4.999... in floating point
+        # and truncation would turn the intended -5% into -4%
+        adjusted_rate = rate_percentage + round((speed - 1.0) * 50)
+        rate = f"{adjusted_rate:+d}%"
+        # Create communicate object with parameters
+        communicate = edge_tts.Communicate(
+            text,
+            voice,
+            rate=rate,
+            pitch=emotion_params["pitch"],
+            volume=emotion_params["volume"]
         )
+        # Generate audio to temporary file
+        temp_dir = tempfile.mkdtemp()
+        output_path = os.path.join(temp_dir, "output.mp3")
+        await communicate.save(output_path)
+        # Return audio file path and metadata
         return output_path, {
             "success": True,
+            # .get mirrors the voice fallback above, so an unknown ID cannot
+            # raise KeyError after the audio has already been synthesized
+            "voice": VOICE_DESCRIPTIONS.get(voice_id, VOICE_DESCRIPTIONS[0]),
+            "voice_id": voice_id,
+            "emotion_id": emotion_id,
+            "speed": speed,
+            "parameters": {
+                "rate": rate,
+                "pitch": emotion_params["pitch"],
+                "volume": emotion_params["volume"]
+            }
         }
     except Exception as e:
+        # Do not leave an empty or partial temp directory behind on failure
+        if temp_dir is not None:
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        return None, {
+            "success": False,
+            "error": str(e)
+        }
+def tts_wrapper(text, voice_id, emotion_id, speed):
+    """Run the async generate_speech() to completion from a sync callback.
+    Args:
+        text: Text to synthesize
+        voice_id: Voice ID forwarded to generate_speech()
+        emotion_id: Emotion ID forwarded to generate_speech()
+        speed: Speech rate multiplier forwarded to generate_speech()
+    Returns:
+        The (audio_path, metadata) tuple produced by generate_speech().
+    """
+    # asyncio.run() creates a fresh event loop and always closes it, so
+    # repeated requests no longer accumulate unclosed loops
+    return asyncio.run(generate_speech(text, voice_id, emotion_id, speed))
 # Create Gradio interface
+with gr.Blocks(title="Chinese TTS API for n8n", theme=gr.themes.Soft()) as demo:
+    # Page header: title plus a summary table of the three numeric controls
     gr.Markdown("""
     # 🎙️ Chinese TTS API for n8n
+    ### Stable Edge TTS backend with voice and emotion control
+    | Parameter | Range | Description |
+    |-----------|-------|-------------|
+    | Voice ID | 0-4 | Different voice characteristics |
+    | Emotion ID | 0-4 | Emotional expression |
+    | Speed | 0.5-2.0 | Speech rate |
     """)
     with gr.Row():
+        # Left column: text input and the voice / emotion / speed controls
+        with gr.Column(scale=1):
             text_input = gr.Textbox(
+                label="📝 Text (支持中文/English)",
+                placeholder="输入要转换的文字...",
+                lines=4,
+                value="你好，欢迎使用语音合成服务。"
             )
+            with gr.Row():
+                voice_slider = gr.Slider(
+                    minimum=0, maximum=4, step=1, value=1,
+                    label="Voice ID (0-4)"
+                )
+                # Initial text corresponds to the slider default (value=1)
+                voice_preview = gr.Markdown("**Selected:** Sweet Voice (Xiaoyi)")
+            with gr.Row():
+                emotion_slider = gr.Slider(
+                    minimum=0, maximum=4, step=1, value=0,
+                    label="Emotion ID (0-4)"
+                )
+                # Initial text corresponds to the slider default (value=0)
+                emotion_preview = gr.Markdown("**Selected:** Neutral")
+            speed_slider = gr.Slider(
+                minimum=0.5, maximum=2.0, step=0.1, value=1.0,
+                label="Speed"
+            )
+            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+        # Right column: generated audio, JSON metadata and reference tables
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated Audio",
+                type="filepath"
+            )
+            json_output = gr.JSON(
+                label="Response Data (for n8n)"
+            )
+            # Voice reference table
+            gr.Markdown("""
+            ### Voice Reference
+            | ID | Voice | Description |
+            |----|-------|-------------|
+            | 0 | Xiaoxiao | Loyal Sister - Warm, caring |
+            | 1 | Xiaoyi | Sweet Voice - Lively, cute |
+            | 2 | Yunjian | Cool Voice - Deep, calm |
+            | 3 | Xiaomeng | Loli Voice - Childish |
+            | 4 | Yunxi | Professional - Clear |
+            ### Emotion Reference
+            | ID | Emotion | Effect |
+            |----|---------|--------|
+            | 0 | Neutral | Normal speech |
+            | 1 | Happy | Higher pitch, faster |
+            | 2 | Sad | Lower pitch, slower |
+            | 3 | Excited | High energy, fast |
+            | 4 | Frustrated | Tense, emphasized |
+            """)
+    # Update previews when sliders change
+    def update_voice_preview(voice_id):
+        """Return the Markdown preview text for the selected voice ID."""
+        return f"**Selected:** {VOICE_DESCRIPTIONS[voice_id]}"
+    def update_emotion_preview(emotion_id):
+        """Return the Markdown preview text for the selected emotion ID."""
+        # Indexed by emotion ID; same order as the presets in get_emotion_params()
+        emotions = ["Neutral", "Happy", "Sad", "Excited", "Frustrated"]
+        return f"**Selected:** {emotions[emotion_id]}"
+    voice_slider.change(
+        fn=update_voice_preview,
+        inputs=voice_slider,
+        outputs=voice_preview
+    )
+    emotion_slider.change(
+        fn=update_emotion_preview,
+        inputs=emotion_slider,
+        outputs=emotion_preview
+    )
+    # Generate button click
+    # tts_wrapper returns (audio_path, metadata), one value per output below
     generate_btn.click(
+        fn=tts_wrapper,
+        inputs=[text_input, voice_slider, emotion_slider, speed_slider],
         outputs=[audio_output, json_output]
     )
+# For API mode (used by n8n)
+async def api_generate(params):
+    """API endpoint for n8n
+    Args:
+        params: Mapping with optional keys "text", "voice_id", "emotion_id"
+            and "speed"; missing keys default to "", 1, 0 and 1.0
+    Returns:
+        {"status": "success", "audio_url": ..., "metadata": ...} when
+        synthesis succeeds, otherwise {"status": "error", "error": <message>}.
+        Values that cannot be converted to int/float are reported through
+        the error form instead of raising.
+    NOTE(review): no call site or route registration for this coroutine is
+    visible in this file - confirm how n8n reaches it.
+    """
+    text = params.get("text", "")
+    try:
+        voice_id = int(params.get("voice_id", 1))
+        emotion_id = int(params.get("emotion_id", 0))
+        speed = float(params.get("speed", 1.0))
+    except (TypeError, ValueError) as exc:
+        # Malformed input uses the same envelope as a synthesis failure
+        return {
+            "status": "error",
+            "error": f"Invalid parameter: {exc}"
+        }
+    audio_path, metadata = await generate_speech(text, voice_id, emotion_id, speed)
+    if metadata["success"]:
+        return {
+            "status": "success",
+            "audio_url": f"/file={audio_path}",
+            "metadata": metadata
+        }
+    else:
+        return {
+            "status": "error",
+            "error": metadata["error"]
+        }
 if __name__ == "__main__":
+    # Script entry point: enable the request queue (max_size=50), then serve
+    # on 0.0.0.0:7860 with show_error=True passed through to launch()
+    demo.queue(max_size=50).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )