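# NOTE: the next line records what appears to be page-header residue from the
# hosting site's file view (uploader, commit message, revision); not program code.
# tbdavid2019 | 台語tts | 9eb576c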
import io
import os
from pathlib import Path
from tempfile import NamedTemporaryFile
import time
import gradio as gr
import boto3
import requests
from openai import OpenAI
from pydub import AudioSegment
from dotenv import load_dotenv
from google import genai
from google.genai import types
# Load environment variables (python-dotenv).
load_dotenv()
# API keys taken from the environment when set; an empty string means "not configured".
# process_and_save_audio uses these as fallbacks when the UI key fields are left blank.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
# OpenAI audio model and voice choices offered in the UI dropdowns.
STANDARD_AUDIO_MODELS = [
    "gpt-4o-mini-tts",
    "gpt-4o-audio-preview",
    "tts-1",
    "tts-1-hd",
]
STANDARD_VOICES = [
    "alloy",
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer",
    "coral",
    "sage",
]
# Markdown shown next to the OpenAI voice dropdowns (user-facing text).
STANDARD_VOICE_NOTES = """
OpenAI 聲音備註:
- alloy: 中性平衡,對話感自然,通用場景。
- echo: 低沉男聲,較穩重,適合旁白或正式說明。
- fable: 溫暖敘事感,適合故事/有聲書。
- onyx: 清晰沉穩男聲,較正式,適合說明/主持。
- nova: 友好女聲,明亮自然,適合對話互動。
- shimmer: 柔和女聲,親切溫暖,適合客服/陪伴。
- coral: 活潑女聲,帶能量感,適合行銷/短視頻。
- sage: 成熟男聲,穩健理性,適合新聞/解說。
"""
# Gemini TTS defaults: the only model offered in the UI and the selectable voices.
GEMINI_MODEL_DEFAULT = "gemini-2.5-pro-preview-tts"
GEMINI_VOICES = [
    "Puck",
    "Charon",
    "Fenrir",
    "Alnilam",
    "Aoede",
    "Algieba",
]
# Frame rate (Hz) used when wrapping Gemini's raw PCM output in an AudioSegment.
GEMINI_SAMPLE_RATE = 24000
# Markdown shown next to the Gemini voice dropdowns (user-facing text).
GEMINI_VOICE_NOTES = """
Gemini 聲音備註:
- Puck: 自然、中音、對話感強,適合一般對話。中文咬字清楚,外國腔較少。
- Charon: 低沉穩重、帶權威感,適合新聞播報/嚴肅公告/懸疑。
- Fenrir: 高亢有活力、語速偏快,適合遊戲旁白或激動解說。講中文時語速有時忽快忽慢,除非要激動效果,建議避開。
- Aoede: 建議女聲首選,中文咬字清楚、外國腔較少。
- Alnilam/Algieba: 舊版常見的名稱;在 gemini-2.5 系列建議優先用 Puck/Aoede/Charon/Fenrir。
中文建議:首選組合 Puck (男) + Aoede (女);若中文朗讀為主且要穩定,避免使用 Fenrir。
"""
# AWS Polly defaults; the region comes from AWS_REGION when that variable is set.
POLLY_VOICE_DEFAULT = "Zhiyu"
POLLY_REGION_DEFAULT = os.getenv("AWS_REGION", "ap-northeast-1")
POLLY_VOICE_NOTES = "AWS Polly 中文目前僅女聲 Zhiyu,雙說話者將共用此聲音。需要 AWS Access Key / Secret / Region 才能使用。"
# Taiwanese TTS endpoint and default model name sent in the request body.
TAI_TTS_URL = "https://learn-language.tokyo/taigiTTS/taigi-text-to-speech"
TAI_TTS_MODEL_DEFAULT = "model6"
TAI_VOICE_NOTES = "台語 TTS (Taiwanese) 目前僅單一女聲,無需 API Key,模型預設 model6。雙說話者將共用同一聲音。"
# Script pre-processing: merge consecutive text from the same speaker.
def optimize_script(script):
    """Parse a dialogue script into (speaker, text) segments.

    Each non-blank line may start with a ``speaker-1:`` or ``speaker-2:`` tag
    (case-insensitive; the full-width colon ``：`` is accepted too). Lines
    without a tag are attributed to ``speaker-1``. Consecutive text from the
    same speaker is joined with a single space so that each run needs only one
    TTS request.

    Lines that carry a tag but no text are skipped: they produce no audio, so
    they neither add stray spaces nor split a speaker's run in two.

    Args:
        script: The raw multi-line script.

    Returns:
        A list of ``(speaker, text)`` tuples in script order, where speaker is
        ``"speaker-1"`` or ``"speaker-2"`` and text is non-empty.
    """
    print("🔄 開始優化腳本處理...")
    optimized = []
    current_speaker = None
    current_parts = []
    for raw_line in script.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Untagged lines default to speaker 1.
        speaker, text = "speaker-1", line
        lowered = line.lower()
        for tag in ("speaker-1", "speaker-2"):
            if lowered.startswith((tag + ":", tag + "：")):
                speaker = tag
                # Drop the tag plus its one-character colon.
                text = line[len(tag) + 1:].strip()
                break
        if not text:
            # Tag with nothing to say: ignore it entirely.
            continue
        # Speaker changed: flush what was collected for the previous speaker.
        if speaker != current_speaker and current_parts:
            optimized.append((current_speaker, " ".join(current_parts)))
            current_parts = []
        current_speaker = speaker
        current_parts.append(text)
    # Flush the final speaker's text.
    if current_parts:
        optimized.append((current_speaker, " ".join(current_parts)))
    print(f"✅ 腳本優化完成,共 {len(optimized)} 段對話")
    return optimized
def get_mp3(text: str, voice: str, audio_model: str, audio_api_key: str, instructions: str = None) -> bytes:
    """Synthesize ``text`` through the OpenAI speech API and return the audio bytes.

    Text longer than 1000 characters is cut into fixed-size slices; every slice
    is requested separately and the returned byte strings are concatenated in
    order. ``instructions`` is added to the request only when it is non-empty.
    Any exception raised during a request is logged and re-raised.
    """
    print(f"🎤 開始生成音頻: 長度 {len(text)} 字符, 聲音: {voice}, 模型: {audio_model}")
    # Per the original author's note: the API has an input limit, and 1000
    # characters per request was chosen as a conservative safety margin.
    chunk_limit = 1000
    client = OpenAI(api_key=audio_api_key)

    def _synthesize(payload: str) -> bytes:
        """Issue one speech request and gather the streamed body into bytes."""
        request_kwargs = {"model": audio_model, "voice": voice, "input": payload}
        if instructions:
            request_kwargs["instructions"] = instructions
            print(f"💬 使用語氣指示: {instructions}")
        print(f"📡 調用 OpenAI TTS API...")
        with client.audio.speech.with_streaming_response.create(**request_kwargs) as response:
            with io.BytesIO() as buffer:
                for piece in response.iter_bytes():
                    buffer.write(piece)
                return buffer.getvalue()

    # Short text: a single request is enough.
    if len(text) <= chunk_limit:
        try:
            audio_data = _synthesize(text)
            print(f"✅ 音頻生成完成: {len(audio_data)} bytes")
            return audio_data
        except Exception as e:
            print(f"❌ 音頻生成失敗: {e}")
            raise

    # Long text: slice it, synthesize each slice, and append the results.
    print(f"📝 文本過長 ({len(text)} 字符),分割成多個區塊")
    text_chunks = [text[start:start + chunk_limit] for start in range(0, len(text), chunk_limit)]
    print(f"📦 共分割成 {len(text_chunks)} 個區塊")
    combined_audio = b""
    for index, chunk in enumerate(text_chunks, 1):
        print(f"🔄 處理區塊 {index}/{len(text_chunks)}: {len(chunk)} 字符")
        try:
            chunk_audio = _synthesize(chunk)
            combined_audio += chunk_audio
            print(f"✅ 區塊 {index} 生成完成: {len(chunk_audio)} bytes")
        except Exception as e:
            print(f"❌ 區塊 {index} 生成失敗: {e}")
            raise
    print(f"🎵 所有區塊合並完成,總大小: {len(combined_audio)} bytes")
    return combined_audio
def get_polly_mp3(text: str, polly_voice: str, polly_region: str, polly_access_key: str = None, polly_secret_key: str = None) -> bytes:
    """Synthesize ``text`` with AWS Polly and return the MP3 bytes.

    Args:
        text: Text to speak.
        polly_voice: Polly ``VoiceId``.
        polly_region: AWS region; ``POLLY_REGION_DEFAULT`` is used when empty.
        polly_access_key: Optional access key id.
        polly_secret_key: Optional secret access key. Explicit credentials are
            passed to boto3 only when BOTH keys are given; otherwise boto3
            resolves credentials on its own.

    Returns:
        The audio stream content requested with ``OutputFormat="mp3"`` and the
        ``neural`` engine.

    Raises:
        Any exception from the Polly call is logged and re-raised.
    """
    print(f"🎤 Polly 生成音頻: 長度 {len(text)} 字符, 聲音: {polly_voice}, 區域: {polly_region}")
    client_kwargs = {"region_name": polly_region or POLLY_REGION_DEFAULT}
    if polly_access_key and polly_secret_key:
        client_kwargs.update(
            aws_access_key_id=polly_access_key,
            aws_secret_access_key=polly_secret_key,
        )
    polly = boto3.client("polly", **client_kwargs)
    try:
        resp = polly.synthesize_speech(
            Text=text,
            OutputFormat="mp3",
            VoiceId=polly_voice,
            Engine="neural",
        )
        stream = resp["AudioStream"]
        try:
            audio_bytes = stream.read()
        finally:
            # Release the underlying HTTP response body even if read() fails.
            stream.close()
        print(f"✅ Polly 音頻生成完成: {len(audio_bytes)} bytes")
        return audio_bytes
    except Exception as e:
        print(f"❌ Polly 音頻生成失敗: {e}")
        raise
def get_tai_tts_mp3(text: str, model: str = TAI_TTS_MODEL_DEFAULT) -> bytes:
    """Fetch synthesized Taiwanese speech for ``text``; no API key is involved.

    Two HTTP calls are made: a POST to ``TAI_TTS_URL`` whose JSON reply is
    expected to contain ``audio_url``, then a GET of that URL. The downloaded
    bytes are returned unchanged (the log message and the caller treat them as
    WAV, despite this function's name). Any failure is logged and re-raised;
    a reply without ``audio_url`` raises ``RuntimeError``.
    """
    print(f"🎤 台語 TTS 生成音頻: 長度 {len(text)} 字符, 模型: {model}")
    request_headers = {"content-type": "application/json", "origin": "https://learn-language.tokyo"}
    request_body = {"text": text, "model": model}
    try:
        # Step 1: ask the service to synthesize and tell us where the audio is.
        synth_resp = requests.post(
            TAI_TTS_URL,
            json=request_body,
            headers=request_headers,
            timeout=60,
        )
        synth_resp.raise_for_status()
        reply = synth_resp.json()
        audio_url = reply.get("audio_url")
        if not audio_url:
            raise RuntimeError(f"台語 TTS 回應中缺少 audio_url: {reply}")
        print(f"🔗 取得音頻 URL: {audio_url}")
        # Step 2: download the audio file itself.
        download = requests.get(audio_url, timeout=60)
        download.raise_for_status()
        wav_bytes = download.content
        print(f"✅ 台語 TTS 音頻下載完成: {len(wav_bytes)} bytes (WAV 格式)")
        return wav_bytes
    except Exception as e:
        print(f"❌ 台語 TTS 音頻生成失敗: {e}")
        raise
def get_gemini_pcm(text: str, voice: str, gemini_model: str, gemini_api_key: str) -> bytes:
    """Generate speech for ``text`` with Gemini TTS and return the inline audio data.

    The caller wraps the returned bytes as 16-bit mono PCM at
    ``GEMINI_SAMPLE_RATE``.

    Args:
        text: Text to speak.
        voice: Name of the prebuilt Gemini voice.
        gemini_model: Model identifier passed to ``generate_content``.
        gemini_api_key: API key; must be non-empty.

    Returns:
        The ``inline_data.data`` of the first part of the first candidate that
        carries any.

    Raises:
        ValueError: If ``gemini_api_key`` is empty.
        RuntimeError: If the response contains no audio data, including when
            the first candidate has no content or no parts.
    """
    if not gemini_api_key:
        raise ValueError("缺少 Gemini API Key")
    print(f"🎤 Gemini 生成音頻: 長度 {len(text)} 字符, 聲音: {voice}, 模型: {gemini_model}")
    client = genai.Client(api_key=gemini_api_key)
    config = types.GenerateContentConfig(
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
            )
        ),
    )
    response = client.models.generate_content(
        model=gemini_model,
        contents=[types.Content(role="user", parts=[types.Part.from_text(text=text)])],
        config=config,
    )
    if response.candidates:
        # content / parts may be absent (e.g. a blocked or empty candidate);
        # treat that as "no audio" rather than failing with AttributeError.
        content = response.candidates[0].content
        parts = (content.parts if content is not None else None) or []
        for part in parts:
            if part.inline_data and part.inline_data.data:
                return part.inline_data.data
    raise RuntimeError("未能取得 Gemini 音頻輸出")
def generate_audio_from_script(
    script: str,
    audio_api_key: str,
    audio_model: str = "gpt-4o-mini-tts",
    speaker1_voice: str = "onyx",
    speaker2_voice: str = "nova",
    volume_boost: float = 0,
    speaker1_instructions: str = "保持活潑愉快的語氣",
    speaker2_instructions: str = "保持活潑愉快的語氣",
) -> tuple[bytes, str]:
    """Render a two-speaker script to one MP3 using OpenAI TTS.

    The script is first reduced to per-speaker segments with
    ``optimize_script``; each segment is synthesized with ``get_mp3`` using the
    voice and instructions of its speaker, decoded with pydub, and appended to
    the running result.

    Args:
        script: Raw script text.
        audio_api_key: OpenAI API key.
        audio_model: OpenAI audio model name.
        speaker1_voice: Voice for ``speaker-1`` segments.
        speaker2_voice: Voice for every other segment.
        volume_boost: Gain in dB; applied only when greater than 0.
        speaker1_instructions: Tone instructions for ``speaker-1``.
        speaker2_instructions: Tone instructions for the other speaker.

    Returns:
        ``(mp3_bytes, log_text)``. ``mp3_bytes`` is ``b""`` when the script
        yields no segments.

    Raises:
        Any exception from synthesis or decoding of a segment is logged and
        re-raised.
    """
    print("🎬 開始從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 說話者聲音: 說話者1={speaker1_voice}, 說話者2={speaker2_voice}")
    print(f"🔊 音量增強: {volume_boost} dB")
    status_log = []
    # Collapse the script into per-speaker segments.
    print("🔍 優化腳本內容...")
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    # pydub segment that accumulates the whole conversation.
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        voice_to_use = speaker1_voice if speaker == "speaker-1" else speaker2_voice
        instructions_to_use = speaker1_instructions if speaker == "speaker-1" else speaker2_instructions
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[{speaker}] {text}")
        try:
            # Synthesize this segment.
            print(f"📡 生成 {speaker} 的音頻...")
            audio_chunk = get_mp3(
                text,
                voice_to_use,
                audio_model,
                audio_api_key,
                instructions_to_use
            )
            print(f"✅ {speaker} 音頻生成完成: {len(audio_chunk)} bytes")
            # Decode via a named temp file; it is closed before pydub reads it
            # by path.
            with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
                temp_file.write(audio_chunk)
                temp_file_path = temp_file.name
            try:
                chunk_segment = AudioSegment.from_mp3(temp_file_path)
            finally:
                # Always remove the temp file, even when decoding fails.
                os.unlink(temp_file_path)
            # Append to the running result.
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ 片段 {i} ({speaker}) 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成音頻: {str(e)}")
            raise
    # Nothing was produced (e.g. an empty script).
    if combined_segment is None:
        error_msg = "❌ 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is logged as a warning, not raised.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB...")
            combined_segment = combined_segment + volume_boost  # pydub: "+ number" applies gain in dB
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗: {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    # Encode the combined segment back to MP3 bytes.
    print("💾 導出最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def generate_gemini_audio_from_script(
    script: str,
    gemini_api_key: str,
    gemini_voice_speaker1: str = "Puck",
    gemini_voice_speaker2: str = "Aoede",
    gemini_model: str = GEMINI_MODEL_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Render a two-speaker script to one MP3 using Gemini TTS.

    Segments come from ``optimize_script``; each is synthesized by
    ``get_gemini_pcm`` and wrapped as a mono AudioSegment with 2-byte samples
    at ``GEMINI_SAMPLE_RATE`` before being appended to the result. A positive
    ``volume_boost`` (dB) is applied at the end. Returns ``(mp3_bytes,
    log_text)``, with ``b""`` when no segment was produced; a segment failure
    is logged and re-raised.
    """
    print("🎬 開始使用 Gemini 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 聲音: 說話者1={gemini_voice_speaker1}, 說話者2={gemini_voice_speaker2}, 模型: {gemini_model}")
    log_lines = []
    turns = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(turns)} 個片段")
    merged = None
    turn_count = len(turns)
    print(f"🎵 開始處理 {turn_count} 個音頻片段 (Gemini)")
    for index, (speaker, text) in enumerate(turns, start=1):
        print(f"🎭 處理片段 {index}/{turn_count}: {speaker} ({len(text)} 字符)")
        log_lines.append(f"[Gemini][{speaker}] {text}")
        try:
            if speaker == "speaker-1":
                chosen_voice = gemini_voice_speaker1
            else:
                chosen_voice = gemini_voice_speaker2
            raw_pcm = get_gemini_pcm(text, chosen_voice, gemini_model, gemini_api_key)
            # Raw PCM carries no header, so the format is given explicitly.
            piece = AudioSegment(
                data=raw_pcm,
                sample_width=2,
                frame_rate=GEMINI_SAMPLE_RATE,
                channels=1,
            )
            if merged is not None:
                merged += piece
                print(f"🔗 已合並 Gemini 片段 {index}/{turn_count}")
            else:
                merged = piece
                print("🔗 創建第一個 Gemini 音頻片段")
        except Exception as e:
            failure = f"❌ Gemini 片段 {index} 生成失敗: {str(e)}"
            print(failure)
            log_lines.append(f"[錯誤] 無法生成 Gemini 音頻: {str(e)}")
            raise
    # No segments at all: report it and hand back empty audio.
    if merged is None:
        failure = "❌ Gemini 沒有生成任何音頻"
        print(failure)
        log_lines.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(log_lines)
    # Optional gain; problems here only produce a warning entry.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (Gemini)...")
            merged = merged + volume_boost
            log_lines.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (Gemini)")
        except Exception as e:
            warning = f"⚠️ 音量調整失敗 (Gemini): {str(e)}"
            print(warning)
            log_lines.append(f"[警告] 音量調整失敗: {str(e)}")
    print("💾 導出 Gemini 最終音頻文件...")
    buffer = io.BytesIO()
    merged.export(buffer, format="mp3")
    mp3_bytes = buffer.getvalue()
    print(f"🎉 Gemini 腳本音頻生成完成!最終大小: {len(mp3_bytes)} bytes")
    return mp3_bytes, "\n".join(log_lines)
def generate_polly_audio_from_script(
    script: str,
    polly_access_key: str,
    polly_secret_key: str,
    polly_region: str,
    polly_voice: str = POLLY_VOICE_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Render a script to one MP3 using AWS Polly.

    Segments come from ``optimize_script``; every segment, whichever speaker it
    belongs to, is synthesized with the same ``polly_voice`` via
    ``get_polly_mp3``, decoded with pydub and appended to the result.

    Args:
        script: Raw script text.
        polly_access_key: AWS access key id (may be empty).
        polly_secret_key: AWS secret access key (may be empty).
        polly_region: AWS region name.
        polly_voice: Polly voice id used for all segments.
        volume_boost: Gain in dB; applied only when greater than 0.

    Returns:
        ``(mp3_bytes, log_text)``. ``mp3_bytes`` is ``b""`` when the script
        yields no segments.

    Raises:
        Any exception from synthesis or decoding of a segment is logged and
        re-raised.
    """
    print("🎬 開始使用 AWS Polly 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 聲音: {polly_voice}, 區域: {polly_region}")
    status_log = []
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段 (Polly)")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[Polly][{speaker}] {text}")
        try:
            audio_bytes = get_polly_mp3(text, polly_voice, polly_region, polly_access_key, polly_secret_key)
            # Decode via a named temp file; it is closed before pydub reads it
            # by path.
            with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_file_path = temp_file.name
            try:
                chunk_segment = AudioSegment.from_mp3(temp_file_path)
            finally:
                # Always remove the temp file, even when decoding fails.
                os.unlink(temp_file_path)
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個 Polly 音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並 Polly 片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ Polly 片段 {i} 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成 Polly 音頻: {str(e)}")
            raise
    # Nothing was produced (e.g. an empty script).
    if combined_segment is None:
        error_msg = "❌ Polly 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is logged as a warning, not raised.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (Polly)...")
            combined_segment = combined_segment + volume_boost
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (Polly)")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗 (Polly): {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    print("💾 導出 Polly 最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 Polly 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def generate_tai_audio_from_script(
    script: str,
    tai_model: str = TAI_TTS_MODEL_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Render a script to one MP3 using the Taiwanese TTS service.

    Segments come from ``optimize_script``; every segment, whichever speaker it
    belongs to, is synthesized by ``get_tai_tts_mp3`` with the same model,
    decoded as WAV with pydub and appended to the result, which is finally
    exported as MP3.

    Args:
        script: Raw script text.
        tai_model: Model name sent to the service.
        volume_boost: Gain in dB; applied only when greater than 0.

    Returns:
        ``(mp3_bytes, log_text)``. ``mp3_bytes`` is ``b""`` when the script
        yields no segments.

    Raises:
        Any exception from synthesis or decoding of a segment is logged and
        re-raised.
    """
    print("🎬 開始使用台語 TTS 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 模型: {tai_model} (僅單一女聲)")
    status_log = []
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段 (台語 TTS)")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[TaiTTS][{speaker}] {text}")
        try:
            audio_bytes = get_tai_tts_mp3(text, tai_model)
            # The service's audio is handled as WAV, so decode with from_wav.
            with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_file_path = temp_file.name
            try:
                chunk_segment = AudioSegment.from_wav(temp_file_path)
            finally:
                # Always remove the temp file, even when decoding fails.
                os.unlink(temp_file_path)
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個 台語 TTS 音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並 台語 TTS 片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ 台語 TTS 片段 {i} 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成 台語 TTS 音頻: {str(e)}")
            raise
    # Nothing was produced (e.g. an empty script).
    if combined_segment is None:
        error_msg = "❌ 台語 TTS 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is logged as a warning, not raised.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (台語 TTS)...")
            combined_segment = combined_segment + volume_boost
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (台語 TTS)")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗 (台語 TTS): {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    print("💾 導出 台語 TTS 最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 台語 TTS 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def save_audio_file(audio_data: bytes) -> str:
    """Write ``audio_data`` to a new ``.mp3`` file under ``./temp_audio``.

    Before writing, ``.mp3`` files in that directory whose modification time
    is more than 24 hours old are deleted.

    Args:
        audio_data: Bytes to store.

    Returns:
        Path of the newly created file.
    """
    print("💾 開始保存音頻文件...")
    temp_dir = Path("./temp_audio")
    temp_dir.mkdir(exist_ok=True)
    # Remove files older than 24 hours; the cutoff is computed once up front.
    cutoff = time.time() - 24 * 60 * 60
    old_files_count = 0
    for old_file in temp_dir.glob("*.mp3"):
        if old_file.stat().st_mtime < cutoff:
            old_file.unlink()
            old_files_count += 1
    if old_files_count > 0:
        print(f"🧹 清理了 {old_files_count} 個舊的臨時文件")
    # delete=False keeps the file after it is closed so the caller can serve
    # it; the with-block guarantees the handle is closed even if write() fails.
    with NamedTemporaryFile(dir=temp_dir, delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_data)
    print(f"✅ 音頻文件已保存: {temp_file.name} ({len(audio_data)} bytes)")
    return temp_file.name
def process_and_save_audio(
    script,
    api_key,
    gemini_api_key,
    provider,
    model,
    voice1,
    voice2,
    volume_boost,
    instr1,
    instr2,
    gemini_voice_speaker1,
    gemini_voice_speaker2,
    gemini_model,
    polly_access_key,
    polly_secret_key,
    polly_region,
    polly_voice,
    tai_model,
):
    """Generate audio with the selected provider and save it to a file.

    This is the click handler of the generate button; its parameters follow
    the order of that handler's ``inputs`` list. ``provider`` selects the
    backend: ``"Gemini TTS"``, ``"AWS Polly"``, ``"Taiwanese TTS"``; any other
    value uses OpenAI. Blank OpenAI / Gemini keys fall back to the keys read
    from the environment.

    Returns:
        ``(audio_path, log_text)`` on success. ``(None, log_text)`` when the
        backend produced no audio, so no empty file is written or handed to
        the player. ``(None, error_message)`` when any exception occurs.
    """
    try:
        if provider == "Gemini TTS":
            key_to_use = gemini_api_key or GEMINI_API_KEY
            audio_data, status_log = generate_gemini_audio_from_script(
                script,
                key_to_use,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_model,
                volume_boost,
            )
        elif provider == "AWS Polly":
            audio_data, status_log = generate_polly_audio_from_script(
                script,
                polly_access_key,
                polly_secret_key,
                polly_region or POLLY_REGION_DEFAULT,
                polly_voice,
                volume_boost,
            )
        elif provider == "Taiwanese TTS":
            audio_data, status_log = generate_tai_audio_from_script(
                script,
                tai_model or TAI_TTS_MODEL_DEFAULT,
                volume_boost,
            )
        else:
            key_to_use = api_key or OPENAI_API_KEY
            audio_data, status_log = generate_audio_from_script(
                script,
                key_to_use,
                model,
                voice1,
                voice2,
                volume_boost,
                instr1,
                instr2,
            )
        if not audio_data:
            # The generators return b"" for a script with no usable lines;
            # show their log instead of saving a 0-byte .mp3.
            return None, status_log
        audio_path = save_audio_file(audio_data)
        return audio_path, status_log
    except Exception as e:
        # UI boundary: report the failure in the log box instead of crashing.
        error_message = f"生成音頻時發生錯誤: {str(e)}"
        print(error_message)
        return None, error_message
def toggle_provider(selected_provider):
    """Return visibility updates that show only the selected provider's fields.

    Produces a tuple of 17 ``gr.update(visible=...)`` objects, one per
    component in the ``outputs`` list of ``provider.change``, in that order.
    Each component is visible only when its provider is the selected one.
    """
    openai_on = selected_provider == "OpenAI TTS"
    gemini_on = selected_provider == "Gemini TTS"
    polly_on = selected_provider == "AWS Polly"
    tai_on = selected_provider == "Taiwanese TTS"
    visibility = [
        openai_on,  # api_key
        gemini_on,  # gemini_api_key
        openai_on,  # audio_model
        openai_on,  # speaker1_voice
        openai_on,  # speaker2_voice
        openai_on,  # openai voice notes
        gemini_on,  # gemini_model
        gemini_on,  # gemini_voice_speaker1
        gemini_on,  # gemini_voice_speaker2
        gemini_on,  # gemini voice notes
        polly_on,  # polly access key
        polly_on,  # polly secret key
        polly_on,  # polly region
        polly_on,  # polly voice dropdown
        polly_on,  # polly notes
        tai_on,  # tai model
        tai_on,  # tai notes
    ]
    return tuple(gr.update(visible=flag) for flag in visibility)
# Gradio interface
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the TTS generator.

    The page has an input side (script, credentials, provider choice,
    per-provider model/voice controls, tone instructions, volume slider,
    generate button) and an output side (audio player and generation log).
    Two events are wired: the generate button calls ``process_and_save_audio``
    and a provider change calls ``toggle_provider`` to show/hide fields.

    NOTE(review): the indentation below was reconstructed from a copy of the
    file whose leading whitespace was lost; confirm which components belong
    inside each ``gr.Row()`` / ``gr.Column()`` against the deployed layout.
    """
    with gr.Blocks(title="TTS Generator") as demo:
        gr.Markdown("""
<style>
#header { text-align: center; margin-bottom: 20px; }
</style>
""")
        gr.Markdown("# 語音合成器 | TTS Generator", elem_id="header")
        with gr.Row():
            with gr.Column(scale=1):
                # Input area
                script_input = gr.Textbox(
                    label="輸入腳本 | Input Script",
                    placeholder="""請粘貼腳本內容,格式如下:
speaker-1: 歡迎來到 David888 Podcast,我是 David...
speaker-2: 大家好,我是 Cordelia...
沒有標記說話者的行會默認使用說話者1的聲音。
提示:為提高效率,相同說話者的多行文字將自動合並處理。""",
                    lines=20
                )
                # Credential fields; only the OpenAI one starts visible,
                # matching the default provider selected below.
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    type="password",
                    visible=True
                )
                gemini_api_key = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    visible=False
                )
                polly_access_key = gr.Textbox(
                    label="AWS Access Key ID",
                    type="password",
                    visible=False
                )
                polly_secret_key = gr.Textbox(
                    label="AWS Secret Access Key",
                    type="password",
                    visible=False
                )
                polly_region = gr.Textbox(
                    label="AWS Region",
                    value=POLLY_REGION_DEFAULT,
                    visible=False
                )
                # These choice strings are the exact values compared in
                # process_and_save_audio and toggle_provider.
                provider = gr.Radio(
                    label="TTS 服務 | Provider",
                    choices=["OpenAI TTS", "Gemini TTS", "AWS Polly", "Taiwanese TTS"],
                    value="OpenAI TTS"
                )
                # OpenAI-specific controls (visible by default).
                with gr.Row():
                    audio_model = gr.Dropdown(
                        label="音頻模型 | Audio Model",
                        choices=STANDARD_AUDIO_MODELS,
                        value="gpt-4o-mini-tts",
                        visible=True
                    )
                    speaker1_voice = gr.Dropdown(
                        label="說話者1聲音 (男角) | Speaker 1 Voice (Male)",
                        choices=STANDARD_VOICES,
                        value="onyx",
                        visible=True
                    )
                    speaker2_voice = gr.Dropdown(
                        label="說話者2聲音 (女角) | Speaker 2 Voice (Female)",
                        choices=STANDARD_VOICES,
                        value="nova",
                        visible=True
                    )
                openai_voice_notes = gr.Markdown(STANDARD_VOICE_NOTES, visible=True)
                # Gemini-specific controls (hidden until Gemini is selected).
                with gr.Row():
                    gemini_model = gr.Dropdown(
                        label="Gemini 模型 | Gemini Model",
                        choices=[GEMINI_MODEL_DEFAULT],
                        value=GEMINI_MODEL_DEFAULT,
                        visible=False
                    )
                    gemini_voice_speaker1 = gr.Dropdown(
                        label="Gemini 說話者1聲音 | Speaker 1 Voice",
                        choices=GEMINI_VOICES,
                        value="Puck",
                        visible=False
                    )
                    gemini_voice_speaker2 = gr.Dropdown(
                        label="Gemini 說話者2聲音 | Speaker 2 Voice",
                        choices=GEMINI_VOICES,
                        value="Aoede",
                        visible=False
                    )
                gemini_voice_notes = gr.Markdown(GEMINI_VOICE_NOTES, visible=False)
                # AWS Polly controls (hidden until Polly is selected).
                with gr.Row():
                    polly_voice = gr.Dropdown(
                        label="Polly 聲音 (僅中文女聲)",
                        choices=[POLLY_VOICE_DEFAULT],
                        value=POLLY_VOICE_DEFAULT,
                        visible=False
                    )
                polly_voice_notes = gr.Markdown(POLLY_VOICE_NOTES, visible=False)
                # Taiwanese TTS controls (hidden until that provider is selected).
                with gr.Row():
                    tai_model = gr.Dropdown(
                        label="台語 TTS 模型 (單一女聲)",
                        choices=[TAI_TTS_MODEL_DEFAULT],
                        value=TAI_TTS_MODEL_DEFAULT,
                        visible=False
                    )
                tai_voice_notes = gr.Markdown(TAI_VOICE_NOTES, visible=False)
                # Tone instructions; process_and_save_audio forwards these only
                # on the OpenAI path. toggle_provider never hides them.
                with gr.Row():
                    speaker1_instructions = gr.Textbox(
                        label="說話者1語氣 | Speaker 1 Instructions",
                        value="保持活潑愉快的語氣",
                        placeholder="例如:保持活潑愉快的語氣、用專業嚴肅的口吻說話等",
                        lines=4
                    )
                    speaker2_instructions = gr.Textbox(
                        label="說話者2語氣 | Speaker 2 Instructions",
                        value="保持活潑愉快的語氣",
                        placeholder="例如:保持活潑愉快的語氣、用專業嚴肅的口吻說話等",
                        lines=4
                    )
                volume_boost = gr.Slider(
                    label="音量增益 (dB) | Volume Boost (dB)",
                    minimum=0,
                    maximum=20,
                    value=6,
                    step=1,
                    info="增加音頻音量,單位為分貝(dB)。建議值:6-10 dB"
                )
                generate_button = gr.Button("生成音頻 | Generate Audio")
            with gr.Column(scale=1):
                # Output area
                audio_output = gr.Audio(
                    label="生成的音頻 | Generated Audio",
                    type="filepath"
                )
                status_output = gr.Textbox(
                    label="生成日誌 | Generation Log",
                    lines=20
                )
        # Event handlers
        # The inputs list must stay in the same order as the parameters of
        # process_and_save_audio.
        generate_button.click(
            fn=process_and_save_audio,
            inputs=[
                script_input,
                api_key,
                gemini_api_key,
                provider,
                audio_model,
                speaker1_voice,
                speaker2_voice,
                volume_boost,
                speaker1_instructions,
                speaker2_instructions,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_model,
                polly_access_key,
                polly_secret_key,
                polly_region,
                polly_voice,
                tai_model,
            ],
            outputs=[audio_output, status_output]
        )
        # The outputs list must stay in the same order as the tuple returned
        # by toggle_provider.
        provider.change(
            fn=toggle_provider,
            inputs=provider,
            outputs=[
                api_key,
                gemini_api_key,
                audio_model,
                speaker1_voice,
                speaker2_voice,
                openai_voice_notes,
                gemini_model,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_voice_notes,
                polly_access_key,
                polly_secret_key,
                polly_region,
                polly_voice,
                polly_voice_notes,
                tai_model,
                tai_voice_notes,
            ],
        )
    return demo
# Build the UI once at import time.
demo = create_gradio_interface()
# Hugging Face Spaces expects a global `app` Gradio interface
app = demo.queue()
if __name__ == "__main__":
    # Run directly: listen on all interfaces, port 7860.
    app.launch(server_name="0.0.0.0", server_port=7860)