EdgeTTS / app.py
Ryanus's picture
Update app.py
dcedafb verified
raw
history blame
8.36 kB
import asyncio
import edge_tts
import gradio as gr
import os
from datetime import datetime
from pydub import AudioSegment
# Output folders: single TTS clips go to AUDIO_DIR, podcast output (and its
# temporary per-paragraph segments) goes to PODCAST_DIR.
AUDIO_DIR = "saved_audios"
PODCAST_DIR = "podcast_audios"

# Create both folders at import time; an already-existing folder is fine.
for _output_dir in (AUDIO_DIR, PODCAST_DIR):
    os.makedirs(_output_dir, exist_ok=True)
def generate_unique_filename(folder, prefix="audio", ext="mp3"):
    """Build a timestamped file path inside *folder*.

    The file name has the form ``<prefix>_<timestamp>.<ext>``, where the
    timestamp is the current local time formatted down to microseconds
    (``%Y%m%d_%H%M%S_%f``). Nothing is created on disk.

    Args:
        folder: Directory component of the returned path.
        prefix: Leading part of the file name.
        ext: File extension, without the dot.

    Returns:
        The joined path as a string.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S_%f}"
    filename = "{}_{}.{}".format(prefix, stamp, ext)
    return os.path.join(folder, filename)
async def generate_speech(text, voice, rate, pitch, folder=AUDIO_DIR):
    """Synthesize *text* with edge-tts and save the audio into *folder*.

    Args:
        text: Text to speak.
        voice: Voice name passed to ``edge_tts.Communicate``.
        rate: Rate string passed through as ``rate=``.
        pitch: Pitch string passed through as ``pitch=``.
        folder: Destination directory; defaults to ``AUDIO_DIR``.

    Returns:
        Path of the saved file (named by ``generate_unique_filename``).
    """
    destination = generate_unique_filename(folder)
    await edge_tts.Communicate(text, voice, rate=rate, pitch=pitch).save(destination)
    return destination
async def get_voices():
    """Return the ``ShortName`` of every voice from ``edge_tts.list_voices()``."""
    short_names = []
    for entry in await edge_tts.list_voices():
        short_names.append(entry["ShortName"])
    return short_names
def list_saved_audios():
    """Return paths of the ``.mp3`` files in ``AUDIO_DIR``.

    Paths are ordered by file name, descending.
    """
    mp3_names = (name for name in os.listdir(AUDIO_DIR) if name.endswith(".mp3"))
    return [os.path.join(AUDIO_DIR, name) for name in sorted(mp3_names, reverse=True)]
def list_saved_podcasts():
    """Return paths of the ``.mp3`` files in ``PODCAST_DIR``.

    Paths are ordered by file name, descending.
    """
    mp3_names = (name for name in os.listdir(PODCAST_DIR) if name.endswith(".mp3"))
    return [os.path.join(PODCAST_DIR, name) for name in sorted(mp3_names, reverse=True)]
async def tts_interface(text, voice, rate_percentage, pitch_hz):
    """Gradio handler: turn slider values into offsets and synthesize *text*.

    Args:
        text: Text to speak.
        voice: Voice name forwarded to ``generate_speech``.
        rate_percentage: Signed rate offset; rendered as e.g. ``+10%`` / ``-5%``.
        pitch_hz: Signed pitch offset; rendered as e.g. ``+0Hz`` / ``-20Hz``.

    Returns:
        Path of the generated audio file.
    """
    # Non-negative values need an explicit "+"; negatives already carry "-".
    rate_sign = "+" if rate_percentage >= 0 else ""
    pitch_sign = "+" if pitch_hz >= 0 else ""
    rate = f"{rate_sign}{rate_percentage}%"
    pitch = f"{pitch_sign}{pitch_hz}Hz"
    return await generate_speech(text, voice, rate, pitch)
def play_saved_audio(audio_file):
    """Return *audio_file* unchanged.

    Used as a pass-through handler so a dropdown selection can be forwarded
    directly to an audio player component.
    """
    return audio_file
async def podcast_produce(scripts, voice, rate_percentage, pitch_hz, bgm_file, podcast_title, podcast_desc):
    """Synthesize each script paragraph, join them, and export a podcast MP3.

    Every non-blank entry of *scripts* is synthesized with edge-tts into a
    temporary file in ``PODCAST_DIR``, loaded with pydub, and the temporary
    file is deleted. The segments are concatenated in order; if background
    music is given it is attenuated by 10 dB, cut to the podcast length and
    overlaid. A ``.txt`` file with the title and description is written next
    to the exported MP3.

    Args:
        scripts: Iterable of paragraph texts; ``None`` and blank entries are
            skipped.
        voice: Voice name passed to ``edge_tts.Communicate``.
        rate_percentage: Signed rate offset, rendered as e.g. ``+10%``.
        pitch_hz: Signed pitch offset, rendered as e.g. ``-5Hz``.
        bgm_file: Optional background music, either a file path string or an
            object exposing the path as ``.name``; ``None`` for no music.
        podcast_title: Title written to the metadata file.
        podcast_desc: Description written to the metadata file.

    Returns:
        Path of the exported podcast file, or ``None`` when no paragraph
        contained any text.
    """
    rate = f"{'+' if rate_percentage >= 0 else ''}{rate_percentage}%"
    pitch = f"{'+' if pitch_hz >= 0 else ''}{pitch_hz}Hz"

    audio_segments = []
    for idx, text in enumerate(scripts):
        # Skip missing or whitespace-only paragraphs.
        if not text or not text.strip():
            continue
        temp_audio = generate_unique_filename(PODCAST_DIR, prefix=f"segment{idx}")
        try:
            communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
            await communicate.save(temp_audio)
            audio_segments.append(AudioSegment.from_file(temp_audio))
        finally:
            # Remove the temporary segment even if synthesis or decoding
            # failed, so failed runs do not leave stray files behind.
            if os.path.exists(temp_audio):
                os.remove(temp_audio)

    if not audio_segments:
        return None

    podcast_audio = sum(audio_segments)

    # The upload may arrive as a plain path string or as a file-like object
    # carrying its path in ``.name``; accept both.
    if isinstance(bgm_file, str):
        bgm_path = bgm_file
    else:
        bgm_path = getattr(bgm_file, "name", None)
    if bgm_path and os.path.isfile(bgm_path):
        bgm = AudioSegment.from_file(bgm_path).apply_gain(-10)
        bgm = bgm[:len(podcast_audio)]
        podcast_audio = podcast_audio.overlay(bgm)

    podcast_file = generate_unique_filename(PODCAST_DIR, prefix="podcast")
    podcast_audio.export(podcast_file, format="mp3")

    # Swap only the extension; str.replace would also touch any earlier
    # ".mp3" occurrence in the path.
    meta_file = os.path.splitext(podcast_file)[0] + ".txt"
    with open(meta_file, "w", encoding="utf-8") as f:
        f.write(f"Title: {podcast_title}\nDescription: {podcast_desc}\n")
    return podcast_file
def clear_textbox():
    """Return an empty string, used to reset a textbox."""
    return str()
def clear_paragraphs():
    """Return a fresh paragraph list holding a single empty paragraph."""
    blank = ""
    return [blank]
def add_paragraph(paragraphs):
    """Return a new list: *paragraphs* followed by one empty paragraph.

    The input list is left unmodified.
    """
    return paragraphs + [""]
def remove_paragraph(paragraphs):
    """Return a new list without the last paragraph.

    A list with one or zero entries is returned as an unchanged copy, so at
    least one paragraph always remains. The input list is left unmodified.
    """
    if len(paragraphs) <= 1:
        return paragraphs[:]
    return paragraphs[:-1]
def update_paragraphs_ui(paragraphs):
    """Create one three-line, editable ``gr.Textbox`` per paragraph.

    Each textbox is pre-filled with its paragraph text and labelled with its
    1-based position.

    Returns:
        A list of ``gr.Textbox`` components, in paragraph order.
    """
    # Build the set of Textbox components to return.
    boxes = []
    for i, content in enumerate(paragraphs):
        box = gr.Textbox(value=content, label=f"段落{i+1}內容", lines=3, interactive=True)
        boxes.append(box)
    return boxes
def collect_paragraphs(*args):
    """Gather all positional arguments (paragraph texts) into a list."""
    # Collect the content of every paragraph.
    return [*args]
async def main():
    """Build the Gradio interface and launch the app.

    The UI has four tabs: single-text speech synthesis, a browser for saved
    audio, a multi-paragraph podcast builder, and a browser for saved
    podcasts.

    The podcast tab pre-creates a fixed pool of paragraph textboxes and
    shows or hides them. Event listeners take a fixed list of components at
    build time, so textboxes cannot be added to a listener's inputs
    afterwards; toggling visibility gives the same "add / remove paragraph"
    experience with a statically wired handler.
    """
    max_paragraphs = 10  # size of the pre-created paragraph textbox pool
    preferred_voice = "zh-CN-XiaoxiaoNeural"

    voices = await get_voices()
    # Fall back to the first listed voice so the dropdowns never start with
    # a value that is missing from their choices.
    if preferred_voice in voices:
        default_voice = preferred_voice
    else:
        default_voice = voices[0] if voices else None

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("## 🎙️ Edge TTS 語音合成與播客製作\n\n- 多段腳本自由增減、內容可清空\n- 介面直覺、操作友善、檔案自動管理")

        with gr.Tab("語音合成"):
            with gr.Row():
                text_input = gr.Textbox(lines=5, label="輸入文本")
                clear_btn = gr.Button("清空")
            voice_input = gr.Dropdown(voices, label="選擇語音", value=default_voice)
            rate_input = gr.Slider(-50, 50, value=0, step=1, label="語速調整 (%)")
            pitch_input = gr.Slider(-50, 50, value=0, step=1, label="音高調整 (Hz)")
            tts_btn = gr.Button("生成語音")
            audio_output = gr.Audio(type="filepath", label="生成的語音")
            tts_btn.click(
                fn=tts_interface,
                inputs=[text_input, voice_input, rate_input, pitch_input],
                outputs=audio_output,
            )
            clear_btn.click(fn=clear_textbox, outputs=text_input)

        with gr.Tab("檢視已儲存語音"):
            audio_files = gr.Dropdown(list_saved_audios(), label="選擇已儲存語音檔案", interactive=True)
            # The choices above are read once at build time; this button
            # re-reads the folder so newly generated files become selectable.
            refresh_audio_btn = gr.Button("重新整理")
            saved_audio_output = gr.Audio(type="filepath", label="播放已儲存語音")
            audio_files.change(fn=play_saved_audio, inputs=audio_files, outputs=saved_audio_output)
            refresh_audio_btn.click(
                fn=lambda: gr.update(choices=list_saved_audios()),
                outputs=audio_files,
            )

        with gr.Tab("播客製作"):
            gr.Markdown("### 📝 多段腳本輸入（可自由增減段落）")
            # Per-session list with one entry per visible paragraph; only
            # its length is used, the text itself lives in the textboxes.
            paragraphs_state = gr.State([""])
            with gr.Column():
                paragraph_boxes = [
                    gr.Textbox(
                        label=f"段落{i+1}內容",
                        lines=3,
                        interactive=True,
                        visible=(i == 0),
                    )
                    for i in range(max_paragraphs)
                ]
            add_btn = gr.Button("新增段落")
            remove_btn = gr.Button("刪除段落")
            clear_all_btn = gr.Button("全部清空")

            def box_updates(count, clear_all=False):
                """Return one update per pooled textbox, showing the first *count*."""
                updates = []
                for i in range(max_paragraphs):
                    if clear_all or i >= count:
                        # Hidden boxes are emptied so removed text cannot
                        # reappear when the slot is shown again.
                        updates.append(gr.update(visible=i < count, value=""))
                    else:
                        updates.append(gr.update(visible=True))
                return updates

            def on_add(paragraphs):
                """Show one more paragraph box, up to the pool size."""
                if len(paragraphs) >= max_paragraphs:
                    new_paragraphs = paragraphs
                else:
                    new_paragraphs = add_paragraph(paragraphs)
                return (new_paragraphs, *box_updates(len(new_paragraphs)))

            def on_remove(paragraphs):
                """Hide and empty the last paragraph box; one box always remains."""
                new_paragraphs = remove_paragraph(paragraphs)
                return (new_paragraphs, *box_updates(len(new_paragraphs)))

            def on_clear():
                """Reset to a single, empty paragraph box."""
                new_paragraphs = clear_paragraphs()
                return (new_paragraphs, *box_updates(len(new_paragraphs), clear_all=True))

            paragraph_outputs = [paragraphs_state, *paragraph_boxes]
            add_btn.click(on_add, inputs=paragraphs_state, outputs=paragraph_outputs)
            remove_btn.click(on_remove, inputs=paragraphs_state, outputs=paragraph_outputs)
            clear_all_btn.click(on_clear, outputs=paragraph_outputs)

            # Synthesis parameters
            voice_input2 = gr.Dropdown(voices, label="選擇語音", value=default_voice)
            rate_input2 = gr.Slider(-50, 50, value=0, step=1, label="語速調整 (%)")
            pitch_input2 = gr.Slider(-50, 50, value=0, step=1, label="音高調整 (Hz)")
            bgm_input = gr.File(label="上傳背景音樂（可選）")
            podcast_title = gr.Textbox(label="播客標題")
            podcast_desc = gr.Textbox(label="播客描述")
            podcast_btn = gr.Button("生成播客")
            podcast_output = gr.Audio(type="filepath", label="生成的播客音檔")

            async def on_podcast_btn_click(paragraphs, voice, rate, pitch, bgm, title, desc, *texts):
                """Produce the podcast from the currently visible paragraph boxes."""
                # The state is passed in as an input so the per-session value
                # is used; reading `paragraphs_state.value` would only give
                # the initial value set at build time.
                scripts = [text or "" for text in texts[:len(paragraphs)]]
                return await podcast_produce(scripts, voice, rate, pitch, bgm, title, desc)

            podcast_btn.click(
                fn=on_podcast_btn_click,
                inputs=[
                    paragraphs_state,
                    voice_input2,
                    rate_input2,
                    pitch_input2,
                    bgm_input,
                    podcast_title,
                    podcast_desc,
                    *paragraph_boxes,
                ],
                outputs=podcast_output,
            )

        with gr.Tab("檢視已儲存播客"):
            podcast_files = gr.Dropdown(list_saved_podcasts(), label="選擇已儲存播客檔案", interactive=True)
            # Same as the saved-audio tab: re-read the folder on demand.
            refresh_podcast_btn = gr.Button("重新整理")
            saved_podcast_output = gr.Audio(type="filepath", label="播放已儲存播客")
            podcast_files.change(fn=play_saved_audio, inputs=podcast_files, outputs=saved_podcast_output)
            refresh_podcast_btn.click(
                fn=lambda: gr.update(choices=list_saved_podcasts()),
                outputs=podcast_files,
            )

    demo.launch()
# Script entry point: main() is a coroutine, so it is driven with asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())