ILRDF-AI-Translator / audio_processor.py
ILRDF-Lowking's picture
Update audio_processor.py
df4782a verified
Raw
History Blame Contribute Delete
2.32 kB
import os
import shutil
from gradio_client import Client, handle_file
import config
# Initialize the speech API connections.
# Both names are bound to None up front so they always exist at module level:
# a failed connection then shows up as a handled error inside the functions
# that use the client, rather than as a NameError on an undefined global.
asr_client = None
tts_client = None

# Each client is connected in its own try block so that one endpoint being
# unreachable does not prevent the other from being initialized.
try:
    # ASR recognition engine (sapolita-kaldi)
    asr_client = Client("https://ai-labs.ilrdf.org.tw/sapolita-kaldi/")
except Exception as e:
    print(f"語音模組初始化失敗: {e}")

try:
    # TTS synthesis engine (hnang-kari-ai-asi-sluhay)
    tts_client = Client("https://ai-labs.ilrdf.org.tw/hnang-kari-ai-asi-sluhay/")
except Exception as e:
    print(f"語音模組初始化失敗: {e}")
def get_clean_value(res):
    """Unwrap an API response into its payload.

    - A dict that has a 'value' key yields the value stored under that key.
    - A non-empty list yields its first element.
    - Anything else (including an empty list or a dict without 'value')
      is returned unchanged.
    """
    has_value_key = isinstance(res, dict) and 'value' in res
    if has_value_key:
        return res['value']

    is_nonempty_list = isinstance(res, list) and len(res) != 0
    return res[0] if is_nonempty_list else res
def speech_to_text(audio_path, tribe_name):
    """
    Ear module: transcribe an audio file to text (ASR).

    Used for recognizing voice messages and video audio tracks.

    Args:
        audio_path: Path of the audio file, passed through handle_file().
        tribe_name: Key into config.TRIBE_MAP selecting the language.

    Returns:
        The recognition result after get_clean_value(), or None if the
        request failed (the error is printed).
    """
    # Look up the ASR code for this language in config.TRIBE_MAP; an unknown
    # tribe or an entry without "asr_code" falls back to "formosan_ami".
    tribe_entry = config.TRIBE_MAP.get(tribe_name, {})
    asr_code = tribe_entry.get("asr_code", "formosan_ami")

    try:
        # Call the recognition API.
        raw_response = asr_client.predict(
            dialect_id=asr_code,
            audio_data=handle_file(audio_path),
            api_name="/automatic_speech_recognition"
        )
        transcript = get_clean_value(raw_response)
    except Exception as e:
        print(f"ASR 辨識失敗: {e}")
        transcript = None
    return transcript
def text_to_speech(text, tribe_name, filename):
    """
    Mouth module: synthesize text into an audio file (TTS).

    Args:
        text: The text to synthesize.
        tribe_name: Language/tribe name. For "阿美" a fixed voice is used;
            for any other value the TTS service is asked for a speaker
            (presumably the default speaker for that language -- confirm
            against the service's "/lambda" endpoint).
        filename: Base name of the output file, without extension.

    Returns:
        The saved path "static/<filename>.wav", or None if speaker lookup,
        synthesis, or moving the file failed (the error is printed).
    """
    # Created outside the try block: failure to create the output directory
    # propagates to the caller instead of being reported as a TTS failure.
    os.makedirs("static", exist_ok=True)
    save_path = f"static/{filename}.wav"
    try:
        if tribe_name == "阿美":
            # Amis always uses this specific female voice (kept from 3.0).
            # The remote speaker lookup is skipped here because its result
            # would be discarded, and a failure of that unused call should
            # not abort synthesis.
            speaker = "阿美_秀姑巒_女聲1"
        else:
            # 1. Ask the service for the speaker code for this language.
            speaker = get_clean_value(
                tts_client.predict(ethnicity=tribe_name, api_name="/lambda")
            )
        # 2. Run the synthesis.
        temp_file = tts_client.predict(
            ref=speaker,
            gen_text_input=text,
            api_name="/default_speaker_tts"
        )
        # 3. Move the temporary file into the static folder.
        shutil.move(temp_file, save_path)
        return save_path
    except Exception as e:
        print(f"TTS 合成失敗: {e}")
        return None