Spaces:
Build error
Build error
| import io | |
| import os | |
| from pathlib import Path | |
| from tempfile import NamedTemporaryFile | |
| import time | |
| import gradio as gr | |
| import boto3 | |
| import requests | |
| from openai import OpenAI | |
| from pydub import AudioSegment | |
| from dotenv import load_dotenv | |
| from google import genai | |
| from google.genai import types | |
# Load environment variables.
load_dotenv()
# Read the OpenAI / Gemini API keys from the environment (empty string when unset).
# process_and_save_audio uses these as a fallback when the UI field is left empty.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
# Standard audio model and voice options (choices for the OpenAI dropdowns).
STANDARD_AUDIO_MODELS = [
    "gpt-4o-mini-tts",
    "gpt-4o-audio-preview",
    "tts-1",
    "tts-1-hd",
]
STANDARD_VOICES = [
    "alloy",
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer",
    "coral",
    "sage",
]
# User-facing notes shown next to the OpenAI voice dropdowns (rendered via gr.Markdown).
STANDARD_VOICE_NOTES = """
OpenAI 聲音備註:
- alloy: 中性平衡,對話感自然,通用場景。
- echo: 低沉男聲,較穩重,適合旁白或正式說明。
- fable: 溫暖敘事感,適合故事/有聲書。
- onyx: 清晰沉穩男聲,較正式,適合說明/主持。
- nova: 友好女聲,明亮自然,適合對話互動。
- shimmer: 柔和女聲,親切溫暖,適合客服/陪伴。
- coral: 活潑女聲,帶能量感,適合行銷/短視頻。
- sage: 成熟男聲,穩健理性,適合新聞/解說。
"""
# Gemini provider: the single model offered in the UI and the selectable voices.
GEMINI_MODEL_DEFAULT = "gemini-2.5-pro-preview-tts"
GEMINI_VOICES = [
    "Puck",
    "Charon",
    "Fenrir",
    "Alnilam",
    "Aoede",
    "Algieba",
]
# Frame rate (Hz) used when wrapping the raw PCM returned by Gemini in an AudioSegment.
GEMINI_SAMPLE_RATE = 24000
# User-facing notes shown next to the Gemini voice dropdowns.
GEMINI_VOICE_NOTES = """
Gemini 聲音備註:
- Puck: 自然、中音、對話感強,適合一般對話。中文咬字清楚,外國腔較少。
- Charon: 低沉穩重、帶權威感,適合新聞播報/嚴肅公告/懸疑。
- Fenrir: 高亢有活力、語速偏快,適合遊戲旁白或激動解說。講中文時語速有時忽快忽慢,除非要激動效果,建議避開。
- Aoede: 建議女聲首選,中文咬字清楚、外國腔較少。
- Alnilam/Algieba: 舊版常見的名稱;在 gemini-2.5 系列建議優先用 Puck/Aoede/Charon/Fenrir。
中文建議:首選組合 Puck (男) + Aoede (女);若中文朗讀為主且要穩定,避免使用 Fenrir。
"""
# AWS Polly provider defaults; the region comes from AWS_REGION when that variable is set.
POLLY_VOICE_DEFAULT = "Zhiyu"
POLLY_REGION_DEFAULT = os.getenv("AWS_REGION", "ap-northeast-1")
POLLY_VOICE_NOTES = "AWS Polly 中文目前僅女聲 Zhiyu,雙說話者將共用此聲音。需要 AWS Access Key / Secret / Region 才能使用。"
# Taiwanese TTS provider: endpoint that get_tai_tts_mp3 POSTs to, and its default model name.
TAI_TTS_URL = "https://learn-language.tokyo/taigiTTS/taigi-text-to-speech"
TAI_TTS_MODEL_DEFAULT = "model6"
TAI_VOICE_NOTES = "台語 TTS (Taiwanese) 目前僅單一女聲,無需 API Key,模型預設 model6。雙說話者將共用同一聲音。"
# Script optimization - merge consecutive text spoken by the same speaker
def optimize_script(script):
    """Parse a dialogue script into merged ``(speaker, text)`` segments.

    Each non-blank line may start with ``speaker-1:`` or ``speaker-2:``
    (matched case-insensitively); lines without a tag are attributed to
    ``speaker-1``. Consecutive lines from the same speaker are joined with
    a single space so that each segment needs only one TTS request.

    Lines that carry a speaker tag but no text are ignored: they contribute
    nothing to speak, and keeping them would add stray whitespace or stop
    later lines by the same speaker from being merged.

    Args:
        script: The raw script text. ``None`` or an empty string yields an
            empty result.

    Returns:
        A list of ``(speaker, text)`` tuples in script order, where
        ``speaker`` is ``"speaker-1"`` or ``"speaker-2"``.
    """
    print("🔄 開始優化腳本處理...")
    optimized = []
    current_speaker = None
    current_parts = []
    for raw_line in (script or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()
        if lowered.startswith("speaker-1:"):
            speaker = "speaker-1"
            text = line.split(":", 1)[1].strip()
        elif lowered.startswith("speaker-2:"):
            speaker = "speaker-2"
            text = line.split(":", 1)[1].strip()
        else:
            # Untagged lines default to speaker 1.
            speaker = "speaker-1"
            text = line
        if not text:
            # A bare tag has nothing to synthesize; skip it entirely.
            continue
        # Speaker changed: flush what was collected for the previous speaker.
        if speaker != current_speaker and current_parts:
            optimized.append((current_speaker, " ".join(current_parts)))
            current_parts = []
        current_speaker = speaker
        current_parts.append(text)
    # Flush the trailing segment.
    if current_parts:
        optimized.append((current_speaker, " ".join(current_parts)))
    print(f"✅ 腳本優化完成,共 {len(optimized)} 段對話")
    return optimized
def get_mp3(text: str, voice: str, audio_model: str, audio_api_key: str, instructions: str = None) -> bytes:
    """Generate speech audio for ``text`` with the OpenAI TTS API.

    Text longer than 1000 characters is cut into fixed-size slices, each
    slice is synthesized with its own request, and the resulting byte
    strings are concatenated in order.

    Args:
        text: The text to speak.
        voice: Voice name passed to the API.
        audio_model: Model name passed to the API.
        audio_api_key: API key used to build the OpenAI client.
        instructions: Optional tone instructions; only sent when truthy.

    Returns:
        The audio bytes streamed back by the API.

    Raises:
        Exception: Any error raised by the API call is logged and re-raised.
    """
    print(f"🎤 開始生成音頻: 長度 {len(text)} 字符, 聲音: {voice}, 模型: {audio_model}")
    # Author's note: the API limits input size (noted as 4096 tokens), and
    # roughly 1000 Chinese characters map to 2000-3000 tokens, so 1000
    # characters per request is used as a safe bound.
    max_text_length = 1000
    client = OpenAI(api_key=audio_api_key)

    def synthesize(input_text: str) -> bytes:
        # One streaming TTS request; the response body is collected in memory.
        request = {"model": audio_model, "voice": voice, "input": input_text}
        if instructions:
            request["instructions"] = instructions
            print(f"💬 使用語氣指示: {instructions}")
        print("📡 調用 OpenAI TTS API...")
        buffer = io.BytesIO()
        with client.audio.speech.with_streaming_response.create(**request) as response:
            for piece in response.iter_bytes():
                buffer.write(piece)
        return buffer.getvalue()

    # Short text: a single request.
    if len(text) <= max_text_length:
        try:
            audio_data = synthesize(text)
            print(f"✅ 音頻生成完成: {len(audio_data)} bytes")
            return audio_data
        except Exception as e:
            print(f"❌ 音頻生成失敗: {e}")
            raise

    # Long text: slice, synthesize each slice, then join the bytes.
    print(f"📝 文本過長 ({len(text)} 字符),分割成多個區塊")
    text_chunks = [
        text[start:start + max_text_length]
        for start in range(0, len(text), max_text_length)
    ]
    print(f"📦 共分割成 {len(text_chunks)} 個區塊")
    audio_parts = []
    for index, chunk in enumerate(text_chunks, 1):
        print(f"🔄 處理區塊 {index}/{len(text_chunks)}: {len(chunk)} 字符")
        try:
            chunk_audio = synthesize(chunk)
            audio_parts.append(chunk_audio)
            print(f"✅ 區塊 {index} 生成完成: {len(chunk_audio)} bytes")
        except Exception as e:
            print(f"❌ 區塊 {index} 生成失敗: {e}")
            raise
    combined_audio = b"".join(audio_parts)
    print(f"🎵 所有區塊合並完成,總大小: {len(combined_audio)} bytes")
    return combined_audio
def get_polly_mp3(text: str, polly_voice: str, polly_region: str, polly_access_key: str = None, polly_secret_key: str = None) -> bytes:
    """Generate MP3 audio for ``text`` with AWS Polly.

    Args:
        text: The text to speak.
        polly_voice: Polly ``VoiceId``.
        polly_region: AWS region; ``POLLY_REGION_DEFAULT`` is used when falsy.
        polly_access_key: Optional AWS access key id.
        polly_secret_key: Optional AWS secret access key. Explicit
            credentials are passed to boto3 only when both key values are
            truthy; otherwise the client is created with the region alone.

    Returns:
        The bytes read from the response's ``AudioStream``.

    Raises:
        Exception: Any error from the synthesis call is logged and re-raised.
    """
    print(f"🎤 Polly 生成音頻: 長度 {len(text)} 字符, 聲音: {polly_voice}, 區域: {polly_region}")
    region = polly_region or POLLY_REGION_DEFAULT
    if polly_access_key and polly_secret_key:
        polly = boto3.client(
            "polly",
            region_name=region,
            aws_access_key_id=polly_access_key,
            aws_secret_access_key=polly_secret_key,
        )
    else:
        polly = boto3.client("polly", region_name=region)
    request = {
        "Text": text,
        "OutputFormat": "mp3",
        "VoiceId": polly_voice,
        "Engine": "neural",
    }
    try:
        response = polly.synthesize_speech(**request)
        audio_bytes = response["AudioStream"].read()
        print(f"✅ Polly 音頻生成完成: {len(audio_bytes)} bytes")
        return audio_bytes
    except Exception as e:
        print(f"❌ Polly 音頻生成失敗: {e}")
        raise
def get_tai_tts_mp3(text: str, model: str = TAI_TTS_MODEL_DEFAULT) -> bytes:
    """Generate audio with the Taiwanese TTS web service (no API key needed).

    The service is called in two steps: a POST that returns a JSON body
    containing ``audio_url``, then a GET that downloads that URL. Despite
    the function name, the log message and the caller treat the downloaded
    payload as WAV data.

    Args:
        text: The text to speak.
        model: Model name sent in the request body.

    Returns:
        The raw bytes downloaded from ``audio_url``.

    Raises:
        RuntimeError: If the JSON response has no ``audio_url``.
        Exception: Any HTTP/network error is logged and re-raised.
    """
    print(f"🎤 台語 TTS 生成音頻: 長度 {len(text)} 字符, 模型: {model}")
    request_headers = {
        "content-type": "application/json",
        "origin": "https://learn-language.tokyo",
    }
    try:
        # Step 1: POST the text to obtain the URL of the rendered audio.
        submit = requests.post(
            TAI_TTS_URL,
            json={"text": text, "model": model},
            headers=request_headers,
            timeout=60,
        )
        submit.raise_for_status()
        payload = submit.json()
        audio_url = payload.get("audio_url")
        if not audio_url:
            raise RuntimeError(f"台語 TTS 回應中缺少 audio_url: {payload}")
        print(f"🔗 取得音頻 URL: {audio_url}")
        # Step 2: download the audio file.
        download = requests.get(audio_url, timeout=60)
        download.raise_for_status()
        audio_bytes = download.content
        print(f"✅ 台語 TTS 音頻下載完成: {len(audio_bytes)} bytes (WAV 格式)")
        return audio_bytes
    except Exception as e:
        print(f"❌ 台語 TTS 音頻生成失敗: {e}")
        raise
def get_gemini_pcm(text: str, voice: str, gemini_model: str, gemini_api_key: str) -> bytes:
    """Generate raw PCM audio for ``text`` with Gemini TTS.

    The caller wraps the returned bytes as 16-bit mono audio at
    ``GEMINI_SAMPLE_RATE``.

    Args:
        text: The text to speak.
        voice: Name of the prebuilt Gemini voice.
        gemini_model: Gemini model name.
        gemini_api_key: API key used to build the client.

    Returns:
        The inline audio data of the first part that carries any.

    Raises:
        ValueError: If ``gemini_api_key`` is empty.
        RuntimeError: If the response contains no audio data, including the
            case where the first candidate has no content or no parts.
    """
    if not gemini_api_key:
        raise ValueError("缺少 Gemini API Key")
    print(f"🎤 Gemini 生成音頻: 長度 {len(text)} 字符, 聲音: {voice}, 模型: {gemini_model}")
    client = genai.Client(api_key=gemini_api_key)
    config = types.GenerateContentConfig(
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
            )
        ),
    )
    response = client.models.generate_content(
        model=gemini_model,
        contents=[types.Content(role="user", parts=[types.Part.from_text(text=text)])],
        config=config,
    )
    if response.candidates:
        # Both `content` and `content.parts` may be absent on a candidate;
        # treat either as "no audio" so the caller gets the RuntimeError
        # below instead of an AttributeError/TypeError.
        content = response.candidates[0].content
        parts = (content.parts if content is not None else None) or []
        for part in parts:
            inline = part.inline_data
            if inline and inline.data:
                return inline.data
    raise RuntimeError("未能取得 Gemini 音頻輸出")
def generate_audio_from_script(
    script: str,
    audio_api_key: str,
    audio_model: str = "gpt-4o-mini-tts",
    speaker1_voice: str = "onyx",
    speaker2_voice: str = "nova",
    volume_boost: float = 0,
    speaker1_instructions: str = "保持活潑愉快的語氣",
    speaker2_instructions: str = "保持活潑愉快的語氣",
) -> tuple[bytes, str]:
    """Generate one MP3 from a two-speaker script using OpenAI TTS.

    The script is first merged into per-speaker segments, each segment is
    synthesized with that speaker's voice and instructions, and the decoded
    segments are appended in order.

    Args:
        script: Raw script text (see ``optimize_script`` for the format).
        audio_api_key: OpenAI API key.
        audio_model: OpenAI TTS model name.
        speaker1_voice: Voice for ``speaker-1`` segments.
        speaker2_voice: Voice for every other segment.
        volume_boost: Gain in dB, applied only when greater than zero.
        speaker1_instructions: Tone instructions for ``speaker-1``.
        speaker2_instructions: Tone instructions for the other speaker.

    Returns:
        ``(mp3_bytes, status_log)``. ``mp3_bytes`` is ``b""`` when the
        script produced no segments.

    Raises:
        Exception: Any failure while generating or decoding a segment is
            logged and re-raised.
    """
    print("🎬 開始從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 說話者聲音: 說話者1={speaker1_voice}, 說話者2={speaker2_voice}")
    print(f"🔊 音量增強: {volume_boost} dB")
    status_log = []
    # Merge the script into per-speaker segments.
    print("🔍 優化腳本內容...")
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    # pydub segment that accumulates the whole conversation.
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        is_first_speaker = speaker == "speaker-1"
        voice_to_use = speaker1_voice if is_first_speaker else speaker2_voice
        instructions_to_use = speaker1_instructions if is_first_speaker else speaker2_instructions
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[{speaker}] {text}")
        try:
            # Synthesize this segment.
            print(f"📡 生成 {speaker} 的音頻...")
            audio_chunk = get_mp3(
                text,
                voice_to_use,
                audio_model,
                audio_api_key,
                instructions_to_use,
            )
            print(f"✅ {speaker} 音頻生成完成: {len(audio_chunk)} bytes")
            # Decode the bytes through a temporary file. The file is created
            # with delete=False, so it is removed in `finally` to avoid
            # leaving it behind when writing or decoding fails.
            temp_file = NamedTemporaryFile(suffix=".mp3", delete=False)
            try:
                with temp_file:
                    temp_file.write(audio_chunk)
                chunk_segment = AudioSegment.from_mp3(temp_file.name)
            finally:
                os.unlink(temp_file.name)
            # Append to the running result.
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ 片段 {i} ({speaker}) 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成音頻: {str(e)}")
            raise
    # Nothing was generated (the script had no segments).
    if combined_segment is None:
        error_msg = "❌ 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is reported but does not abort the export.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB...")
            combined_segment = combined_segment + volume_boost  # gain in dB
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗: {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    # Encode the combined segment back to MP3 bytes.
    print("💾 導出最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def generate_gemini_audio_from_script(
    script: str,
    gemini_api_key: str,
    gemini_voice_speaker1: str = "Puck",
    gemini_voice_speaker2: str = "Aoede",
    gemini_model: str = GEMINI_MODEL_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Generate one MP3 from a two-speaker script using Gemini TTS.

    Each merged segment is synthesized to raw PCM, wrapped as a 16-bit mono
    ``AudioSegment`` at ``GEMINI_SAMPLE_RATE``, and appended in order.

    Args:
        script: Raw script text (see ``optimize_script`` for the format).
        gemini_api_key: Gemini API key.
        gemini_voice_speaker1: Voice for ``speaker-1`` segments.
        gemini_voice_speaker2: Voice for every other segment.
        gemini_model: Gemini model name.
        volume_boost: Gain in dB, applied only when greater than zero.

    Returns:
        ``(mp3_bytes, status_log)``. ``mp3_bytes`` is ``b""`` when the
        script produced no segments.

    Raises:
        Exception: Any failure while generating a segment is logged and
            re-raised.
    """
    print("🎬 開始使用 Gemini 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 聲音: 說話者1={gemini_voice_speaker1}, 說話者2={gemini_voice_speaker2}, 模型: {gemini_model}")
    status_log = []
    segments = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(segments)} 個片段")
    merged = None
    segment_count = len(segments)
    print(f"🎵 開始處理 {segment_count} 個音頻片段 (Gemini)")
    for index, (speaker, text) in enumerate(segments, 1):
        print(f"🎭 處理片段 {index}/{segment_count}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[Gemini][{speaker}] {text}")
        try:
            if speaker == "speaker-1":
                voice_to_use = gemini_voice_speaker1
            else:
                voice_to_use = gemini_voice_speaker2
            pcm_bytes = get_gemini_pcm(text, voice_to_use, gemini_model, gemini_api_key)
            # Wrap the raw PCM so pydub can concatenate and re-encode it.
            piece = AudioSegment(
                data=pcm_bytes,
                sample_width=2,
                frame_rate=GEMINI_SAMPLE_RATE,
                channels=1,
            )
            if merged is not None:
                merged += piece
                print(f"🔗 已合並 Gemini 片段 {index}/{segment_count}")
            else:
                merged = piece
                print("🔗 創建第一個 Gemini 音頻片段")
        except Exception as e:
            failure = f"❌ Gemini 片段 {index} 生成失敗: {e}"
            print(failure)
            status_log.append(f"[錯誤] 無法生成 Gemini 音頻: {e}")
            raise
    # Nothing was generated (the script had no segments).
    if merged is None:
        print("❌ Gemini 沒有生成任何音頻")
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is reported but does not abort the export.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (Gemini)...")
            merged = merged + volume_boost
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (Gemini)")
        except Exception as e:
            print(f"⚠️ 音量調整失敗 (Gemini): {e}")
            status_log.append(f"[警告] 音量調整失敗: {e}")
    print("💾 導出 Gemini 最終音頻文件...")
    buffer = io.BytesIO()
    merged.export(buffer, format="mp3")
    combined_audio = buffer.getvalue()
    print(f"🎉 Gemini 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def generate_polly_audio_from_script(
    script: str,
    polly_access_key: str,
    polly_secret_key: str,
    polly_region: str,
    polly_voice: str = POLLY_VOICE_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Generate one MP3 from a script using AWS Polly.

    Every segment is synthesized with the same ``polly_voice`` regardless
    of speaker, then the decoded segments are appended in order.

    Args:
        script: Raw script text (see ``optimize_script`` for the format).
        polly_access_key: AWS access key id (may be empty).
        polly_secret_key: AWS secret access key (may be empty).
        polly_region: AWS region.
        polly_voice: Polly voice id used for all segments.
        volume_boost: Gain in dB, applied only when greater than zero.

    Returns:
        ``(mp3_bytes, status_log)``. ``mp3_bytes`` is ``b""`` when the
        script produced no segments.

    Raises:
        Exception: Any failure while generating or decoding a segment is
            logged and re-raised.
    """
    print("🎬 開始使用 AWS Polly 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 聲音: {polly_voice}, 區域: {polly_region}")
    status_log = []
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段 (Polly)")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[Polly][{speaker}] {text}")
        try:
            audio_bytes = get_polly_mp3(text, polly_voice, polly_region, polly_access_key, polly_secret_key)
            # Decode through a temporary file. It is created with
            # delete=False, so remove it in `finally` to avoid leaving it
            # behind when writing or decoding fails.
            temp_file = NamedTemporaryFile(suffix=".mp3", delete=False)
            try:
                with temp_file:
                    temp_file.write(audio_bytes)
                chunk_segment = AudioSegment.from_mp3(temp_file.name)
            finally:
                os.unlink(temp_file.name)
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個 Polly 音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並 Polly 片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ Polly 片段 {i} 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成 Polly 音頻: {str(e)}")
            raise
    # Nothing was generated (the script had no segments).
    if combined_segment is None:
        error_msg = "❌ Polly 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is reported but does not abort the export.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (Polly)...")
            combined_segment = combined_segment + volume_boost
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (Polly)")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗 (Polly): {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    print("💾 導出 Polly 最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 Polly 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def generate_tai_audio_from_script(
    script: str,
    tai_model: str = TAI_TTS_MODEL_DEFAULT,
    volume_boost: float = 0,
) -> tuple[bytes, str]:
    """Generate one MP3 from a script using the Taiwanese TTS service.

    Every segment is synthesized with the same model regardless of speaker.
    The service's output is decoded as WAV, the segments are appended in
    order, and the result is exported as MP3.

    Args:
        script: Raw script text (see ``optimize_script`` for the format).
        tai_model: Model name sent to the service.
        volume_boost: Gain in dB, applied only when greater than zero.

    Returns:
        ``(mp3_bytes, status_log)``. ``mp3_bytes`` is ``b""`` when the
        script produced no segments.

    Raises:
        Exception: Any failure while generating or decoding a segment is
            logged and re-raised.
    """
    print("🎬 開始使用台語 TTS 從腳本生成音頻")
    print(f"📜 腳本總長度: {len(script)} 字符")
    print(f"🎤 模型: {tai_model} (僅單一女聲)")
    status_log = []
    optimized_script = optimize_script(script)
    print(f"✅ 腳本優化完成,共 {len(optimized_script)} 個片段")
    combined_segment = None
    total_segments = len(optimized_script)
    print(f"🎵 開始處理 {total_segments} 個音頻片段 (台語 TTS)")
    for i, (speaker, text) in enumerate(optimized_script, 1):
        print(f"🎭 處理片段 {i}/{total_segments}: {speaker} ({len(text)} 字符)")
        status_log.append(f"[TaiTTS][{speaker}] {text}")
        try:
            audio_bytes = get_tai_tts_mp3(text, tai_model)
            # The service returns WAV data, so decode with from_wav. The
            # temporary file is created with delete=False; remove it in
            # `finally` so a failed write/decode does not leave it behind.
            temp_file = NamedTemporaryFile(suffix=".wav", delete=False)
            try:
                with temp_file:
                    temp_file.write(audio_bytes)
                chunk_segment = AudioSegment.from_wav(temp_file.name)
            finally:
                os.unlink(temp_file.name)
            if combined_segment is None:
                combined_segment = chunk_segment
                print("🔗 創建第一個 台語 TTS 音頻片段")
            else:
                combined_segment += chunk_segment
                print(f"🔗 已合並 台語 TTS 片段 {i}/{total_segments}")
        except Exception as e:
            error_msg = f"❌ 台語 TTS 片段 {i} 生成失敗: {str(e)}"
            print(error_msg)
            status_log.append(f"[錯誤] 無法生成 台語 TTS 音頻: {str(e)}")
            raise
    # Nothing was generated (the script had no segments).
    if combined_segment is None:
        error_msg = "❌ 台語 TTS 沒有生成任何音頻"
        print(error_msg)
        status_log.append("[錯誤] 沒有生成任何音頻")
        return b"", "\n".join(status_log)
    # Optional gain; a failure here is reported but does not abort the export.
    if volume_boost > 0:
        try:
            print(f"🔊 調整音量 +{volume_boost} dB (台語 TTS)...")
            combined_segment = combined_segment + volume_boost
            status_log.append(f"[音量] 已增加 {volume_boost} dB")
            print("✅ 音量調整完成 (台語 TTS)")
        except Exception as e:
            warning_msg = f"⚠️ 音量調整失敗 (台語 TTS): {str(e)}"
            print(warning_msg)
            status_log.append(f"[警告] 音量調整失敗: {str(e)}")
    print("💾 導出 台語 TTS 最終音頻文件...")
    output = io.BytesIO()
    combined_segment.export(output, format="mp3")
    combined_audio = output.getvalue()
    print(f"🎉 台語 TTS 腳本音頻生成完成!最終大小: {len(combined_audio)} bytes")
    return combined_audio, "\n".join(status_log)
def save_audio_file(audio_data: bytes) -> str:
    """Write ``audio_data`` to a new ``.mp3`` file under ``./temp_audio``.

    Before writing, ``.mp3`` files in that directory whose modification
    time is more than 24 hours old are deleted. Cleanup is best-effort: a
    file that has already disappeared, or that cannot be removed, does not
    prevent the new file from being saved.

    Args:
        audio_data: The bytes to store.

    Returns:
        The path of the newly created file.
    """
    print("💾 開始保存音頻文件...")
    temp_dir = Path("./temp_audio")
    temp_dir.mkdir(exist_ok=True)
    # Remove files older than 24 hours.
    cutoff = time.time() - 24 * 60 * 60
    old_files_count = 0
    for old_file in temp_dir.glob("*.mp3"):
        try:
            if old_file.stat().st_mtime < cutoff:
                old_file.unlink()
                old_files_count += 1
        except FileNotFoundError:
            # Another request removed it between glob() and stat()/unlink().
            continue
        except OSError as e:
            # Do not let a stuck old file block saving the new one.
            print(f"⚠️ 無法清理臨時文件 {old_file}: {e}")
    if old_files_count > 0:
        print(f"🧹 清理了 {old_files_count} 個舊的臨時文件")
    # Create the new file; `with` guarantees the handle is closed even if
    # the write fails. delete=False keeps the file for the caller.
    with NamedTemporaryFile(dir=temp_dir, delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_data)
        saved_path = temp_file.name
    print(f"✅ 音頻文件已保存: {saved_path} ({len(audio_data)} bytes)")
    return saved_path
def process_and_save_audio(
    script,
    api_key,
    gemini_api_key,
    provider,
    model,
    voice1,
    voice2,
    volume_boost,
    instr1,
    instr2,
    gemini_voice_speaker1,
    gemini_voice_speaker2,
    gemini_model,
    polly_access_key,
    polly_secret_key,
    polly_region,
    polly_voice,
    tai_model,
):
    """Generate audio with the selected provider and save it to a file.

    Supports OpenAI / Gemini / AWS Polly / Taiwanese TTS. This is the click
    handler for the "generate" button; its parameters follow the order of
    the button's ``inputs`` list. Any provider value other than
    ``"Gemini TTS"``, ``"AWS Polly"`` or ``"Taiwanese TTS"`` is handled by
    the OpenAI path. Empty OpenAI/Gemini key fields fall back to the keys
    read from the environment.

    Returns:
        ``(audio_path, status_text)``. ``audio_path`` is ``None`` when the
        script is blank, when no audio was produced, or when generation
        raised; ``status_text`` then carries the reason. Exceptions are
        reported through the return value rather than propagated.
    """
    # Reject a blank script up front instead of producing an empty file.
    if not script or not script.strip():
        error_message = "[錯誤] 腳本內容為空,請先輸入腳本"
        print(error_message)
        return None, error_message
    try:
        if provider == "Gemini TTS":
            key_to_use = gemini_api_key or GEMINI_API_KEY
            audio_data, status_log = generate_gemini_audio_from_script(
                script,
                key_to_use,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_model,
                volume_boost,
            )
        elif provider == "AWS Polly":
            audio_data, status_log = generate_polly_audio_from_script(
                script,
                polly_access_key,
                polly_secret_key,
                polly_region or POLLY_REGION_DEFAULT,
                polly_voice,
                volume_boost,
            )
        elif provider == "Taiwanese TTS":
            audio_data, status_log = generate_tai_audio_from_script(
                script,
                tai_model or TAI_TTS_MODEL_DEFAULT,
                volume_boost,
            )
        else:
            key_to_use = api_key or OPENAI_API_KEY
            audio_data, status_log = generate_audio_from_script(
                script,
                key_to_use,
                model,
                voice1,
                voice2,
                volume_boost,
                instr1,
                instr2,
            )
        # The generators return b"" when nothing was synthesized; do not
        # write a 0-byte .mp3 and hand it to the audio player.
        if not audio_data:
            return None, status_log
        audio_path = save_audio_file(audio_data)
        return audio_path, status_log
    except Exception as e:
        error_message = f"生成音頻時發生錯誤: {str(e)}"
        print(error_message)
        return None, error_message
def toggle_provider(selected_provider):
    """Show only the input widgets that belong to the selected provider.

    Returns a tuple of 17 ``gr.update(visible=...)`` objects, one per
    component in the ``outputs`` list of ``provider.change``, in that
    order. A component is made visible exactly when its provider label
    equals ``selected_provider``.
    """
    # Owning provider of each output component, in output order.
    owners = (
        ["OpenAI TTS"]          # api_key
        + ["Gemini TTS"]        # gemini_api_key
        + ["OpenAI TTS"] * 4    # audio_model, speaker1_voice, speaker2_voice, openai voice notes
        + ["Gemini TTS"] * 4    # gemini_model, gemini_voice_speaker1, gemini_voice_speaker2, gemini voice notes
        + ["AWS Polly"] * 5     # polly access key, secret key, region, voice dropdown, notes
        + ["Taiwanese TTS"] * 2  # tai model, tai notes
    )
    return tuple(
        gr.update(visible=(owner == selected_provider))
        for owner in owners
    )
# Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI and wire its event handlers.

    The layout has an input column (script, credentials, provider choice,
    per-provider model/voice controls, tone instructions, volume slider,
    generate button) and an output column (audio player, generation log).
    Widgets for providers other than OpenAI start hidden; `toggle_provider`
    flips visibility when the provider radio changes.

    NOTE(review): the source this was recovered from had its indentation
    flattened. The nesting of the `with` blocks below (in particular which
    components sit inside each `gr.Row`) and the leading whitespace inside
    the triple-quoted strings are reconstructed - confirm against the
    deployed layout.

    Returns:
        The constructed `gr.Blocks` instance.
    """
    with gr.Blocks(title="TTS Generator") as demo:
        # Inline CSS for the page header, followed by the header itself.
        gr.Markdown("""
<style>
#header { text-align: center; margin-bottom: 20px; }
</style>
""")
        gr.Markdown("# 語音合成器 | TTS Generator", elem_id="header")
        with gr.Row():
            with gr.Column(scale=1):
                # Input area
                script_input = gr.Textbox(
                    label="輸入腳本 | Input Script",
                    placeholder="""請粘貼腳本內容,格式如下:
speaker-1: 歡迎來到 David888 Podcast,我是 David...
speaker-2: 大家好,我是 Cordelia...
沒有標記說話者的行會默認使用說話者1的聲音。
提示:為提高效率,相同說話者的多行文字將自動合並處理。""",
                    lines=20
                )
                # Credential fields; only the OpenAI key is visible initially.
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    type="password",
                    visible=True
                )
                gemini_api_key = gr.Textbox(
                    label="Gemini API Key",
                    type="password",
                    visible=False
                )
                polly_access_key = gr.Textbox(
                    label="AWS Access Key ID",
                    type="password",
                    visible=False
                )
                polly_secret_key = gr.Textbox(
                    label="AWS Secret Access Key",
                    type="password",
                    visible=False
                )
                polly_region = gr.Textbox(
                    label="AWS Region",
                    value=POLLY_REGION_DEFAULT,
                    visible=False
                )
                # Provider selector; these labels are the exact strings compared
                # in toggle_provider and process_and_save_audio.
                provider = gr.Radio(
                    label="TTS 服務 | Provider",
                    choices=["OpenAI TTS", "Gemini TTS", "AWS Polly", "Taiwanese TTS"],
                    value="OpenAI TTS"
                )
                # OpenAI controls (visible by default).
                with gr.Row():
                    audio_model = gr.Dropdown(
                        label="音頻模型 | Audio Model",
                        choices=STANDARD_AUDIO_MODELS,
                        value="gpt-4o-mini-tts",
                        visible=True
                    )
                    speaker1_voice = gr.Dropdown(
                        label="說話者1聲音 (男角) | Speaker 1 Voice (Male)",
                        choices=STANDARD_VOICES,
                        value="onyx",
                        visible=True
                    )
                    speaker2_voice = gr.Dropdown(
                        label="說話者2聲音 (女角) | Speaker 2 Voice (Female)",
                        choices=STANDARD_VOICES,
                        value="nova",
                        visible=True
                    )
                openai_voice_notes = gr.Markdown(STANDARD_VOICE_NOTES, visible=True)
                # Gemini controls (hidden until the Gemini provider is selected).
                with gr.Row():
                    gemini_model = gr.Dropdown(
                        label="Gemini 模型 | Gemini Model",
                        choices=[GEMINI_MODEL_DEFAULT],
                        value=GEMINI_MODEL_DEFAULT,
                        visible=False
                    )
                    gemini_voice_speaker1 = gr.Dropdown(
                        label="Gemini 說話者1聲音 | Speaker 1 Voice",
                        choices=GEMINI_VOICES,
                        value="Puck",
                        visible=False
                    )
                    gemini_voice_speaker2 = gr.Dropdown(
                        label="Gemini 說話者2聲音 | Speaker 2 Voice",
                        choices=GEMINI_VOICES,
                        value="Aoede",
                        visible=False
                    )
                gemini_voice_notes = gr.Markdown(GEMINI_VOICE_NOTES, visible=False)
                # AWS Polly controls (hidden until the Polly provider is selected).
                with gr.Row():
                    polly_voice = gr.Dropdown(
                        label="Polly 聲音 (僅中文女聲)",
                        choices=[POLLY_VOICE_DEFAULT],
                        value=POLLY_VOICE_DEFAULT,
                        visible=False
                    )
                polly_voice_notes = gr.Markdown(POLLY_VOICE_NOTES, visible=False)
                # Taiwanese TTS controls (hidden until that provider is selected).
                with gr.Row():
                    tai_model = gr.Dropdown(
                        label="台語 TTS 模型 (單一女聲)",
                        choices=[TAI_TTS_MODEL_DEFAULT],
                        value=TAI_TTS_MODEL_DEFAULT,
                        visible=False
                    )
                tai_voice_notes = gr.Markdown(TAI_VOICE_NOTES, visible=False)
                # Tone instructions; process_and_save_audio forwards these only
                # on the OpenAI path, and toggle_provider never hides them.
                with gr.Row():
                    speaker1_instructions = gr.Textbox(
                        label="說話者1語氣 | Speaker 1 Instructions",
                        value="保持活潑愉快的語氣",
                        placeholder="例如:保持活潑愉快的語氣、用專業嚴肅的口吻說話等",
                        lines=4
                    )
                    speaker2_instructions = gr.Textbox(
                        label="說話者2語氣 | Speaker 2 Instructions",
                        value="保持活潑愉快的語氣",
                        placeholder="例如:保持活潑愉快的語氣、用專業嚴肅的口吻說話等",
                        lines=4
                    )
                volume_boost = gr.Slider(
                    label="音量增益 (dB) | Volume Boost (dB)",
                    minimum=0,
                    maximum=20,
                    value=6,
                    step=1,
                    info="增加音頻音量,單位為分貝(dB)。建議值:6-10 dB"
                )
                generate_button = gr.Button("生成音頻 | Generate Audio")
            with gr.Column(scale=1):
                # Output area
                audio_output = gr.Audio(
                    label="生成的音頻 | Generated Audio",
                    type="filepath"
                )
                status_output = gr.Textbox(
                    label="生成日誌 | Generation Log",
                    lines=20
                )
        # Event handling
        # The inputs list must stay in the same order as the parameters of
        # process_and_save_audio, which receives them positionally.
        generate_button.click(
            fn=process_and_save_audio,
            inputs=[
                script_input,
                api_key,
                gemini_api_key,
                provider,
                audio_model,
                speaker1_voice,
                speaker2_voice,
                volume_boost,
                speaker1_instructions,
                speaker2_instructions,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_model,
                polly_access_key,
                polly_secret_key,
                polly_region,
                polly_voice,
                tai_model,
            ],
            outputs=[audio_output, status_output]
        )
        # The outputs list must stay in the same order as the tuple returned
        # by toggle_provider (17 visibility updates).
        provider.change(
            fn=toggle_provider,
            inputs=provider,
            outputs=[
                api_key,
                gemini_api_key,
                audio_model,
                speaker1_voice,
                speaker2_voice,
                openai_voice_notes,
                gemini_model,
                gemini_voice_speaker1,
                gemini_voice_speaker2,
                gemini_voice_notes,
                polly_access_key,
                polly_secret_key,
                polly_region,
                polly_voice,
                polly_voice_notes,
                tai_model,
                tai_voice_notes,
            ],
        )
    return demo
# Build the Blocks UI at import time.
demo = create_gradio_interface()
# Hugging Face Spaces expects a global `app` Gradio interface
app = demo.queue()
# When run as a script, serve on all interfaces on port 7860.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)