# Ko-TTS-Arena / tts.py
# Korean TTS Arena - Docker Space deployment
# 한국어 TTS Arena - TTS Router
import os
import json
import base64
import tempfile
import requests
from dotenv import load_dotenv
load_dotenv()
# Korean-capable TTS providers wired into this router:
# - Channel Talk: its own public API
# - ElevenLabs: direct API
# - OpenAI: API
# - Google: API
# Base endpoint for the Channel Talk streaming TTS demo; the voice name is
# appended as a path segment at call time.
CHANNEL_TTS_URL = os.getenv(
    "CHANNEL_TTS_URL",
    "https://ch-tts-streaming-demo.channel.io/v1/text-to-speech"
)
# ElevenLabs credentials/config come from the environment; the voice ID
# defaults to the stock "Rachel" voice.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Rachel (default)
# Registry of selectable TTS models.
# Key: arena-facing model ID; value: the provider name plus the
# provider-specific settings ("model" and/or "voice") consumed by predict_tts().
model_mapping = {
    # Channel Talk TTS (Korean-specialized)
    "channel-hana": {
        "provider": "channel",
        "voice": "hana",
    },
    # ElevenLabs (multilingual) - called via direct API
    "eleven-multilingual-v2": {
        "provider": "elevenlabs",
        "model": "eleven_multilingual_v2",
    },
    # OpenAI TTS
    "openai-tts-1": {
        "provider": "openai",
        "model": "tts-1",
        "voice": "alloy",
    },
    "openai-tts-1-hd": {
        "provider": "openai",
        "model": "tts-1-hd",
        "voice": "alloy",
    },
    # Google Cloud TTS
    "google-wavenet": {
        "provider": "google",
        "voice": "ko-KR-Wavenet-A",
    },
    "google-neural2": {
        "provider": "google",
        "voice": "ko-KR-Neural2-A",
    },
}
def predict_channel_tts(text: str, voice: str = "hana") -> str:
    """Synthesize speech via the Channel Talk TTS API.

    Args:
        text: Text to synthesize.
        voice: Channel Talk voice name, appended to the endpoint URL.

    Returns:
        Path to a temporary WAV file holding the synthesized audio.
        The caller is responsible for cleaning the file up.
    """
    endpoint = f"{CHANNEL_TTS_URL}/{voice}"
    payload = {"text": text, "output_format": "wav_24000"}
    resp = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )
    resp.raise_for_status()
    # Persist the raw audio bytes and hand back the file path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with tmp:
        tmp.write(resp.content)
    return tmp.name
def predict_elevenlabs_tts(text: str, model: str = "eleven_multilingual_v2") -> str:
    """Synthesize speech through the ElevenLabs text-to-speech API.

    Args:
        text: Text to synthesize.
        model: ElevenLabs model ID to use for synthesis.

    Returns:
        Path to a temporary MP3 file holding the synthesized audio.

    Raises:
        ValueError: If the ELEVENLABS_API_KEY environment variable is unset.
    """
    if not ELEVENLABS_API_KEY:
        raise ValueError("ELEVENLABS_API_KEY 환경 변수가 설정되지 않았습니다.")
    request_headers = {
        "xi-api-key": ELEVENLABS_API_KEY,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    body = {
        "text": text,
        "model_id": model,
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75,
        },
    }
    resp = requests.post(
        f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}",
        headers=request_headers,
        json=body,
        timeout=60,
    )
    resp.raise_for_status()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with tmp:
        tmp.write(resp.content)
    return tmp.name
def predict_openai_tts(text: str, model: str = "tts-1", voice: str = "alloy") -> str:
    """Synthesize speech with the OpenAI audio/speech endpoint.

    Args:
        text: Text to synthesize.
        model: OpenAI TTS model name (e.g. "tts-1", "tts-1-hd").
        voice: OpenAI voice preset.

    Returns:
        Path to a temporary WAV file holding the synthesized audio.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is unset.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        raise ValueError("OPENAI_API_KEY 환경 변수가 설정되지 않았습니다.")
    body = {
        "model": model,
        "input": text,
        "voice": voice,
        "response_format": "wav",
    }
    resp = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {key}",
            "Content-Type": "application/json",
        },
        json=body,
        timeout=60,
    )
    resp.raise_for_status()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with tmp:
        tmp.write(resp.content)
    return tmp.name
def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
    """Synthesize Korean speech with the Google Cloud TTS REST API.

    Args:
        text: Text to synthesize.
        voice: Google voice name (ko-KR family).

    Returns:
        Path to a temporary WAV file holding the decoded audio.

    Raises:
        ValueError: If GOOGLE_API_KEY is unset, or the API response carries
            no "audioContent" field.
    """
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY 환경 변수가 설정되지 않았습니다.")
    body = {
        "input": {"text": text},
        "voice": {
            "languageCode": "ko-KR",
            "name": voice,
        },
        "audioConfig": {
            "audioEncoding": "LINEAR16",
            "sampleRateHertz": 24000,
        },
    }
    resp = requests.post(
        f"https://texttospeech.googleapis.com/v1/text:synthesize?key={key}",
        headers={"Content-Type": "application/json"},
        json=body,
        timeout=30,
    )
    resp.raise_for_status()
    # Google returns the audio as a base64 string, not raw bytes.
    encoded = resp.json().get("audioContent")
    if not encoded:
        raise ValueError("Google TTS API가 오디오를 반환하지 않았습니다.")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with tmp:
        tmp.write(base64.b64decode(encoded))
    return tmp.name
def predict_tts(text: str, model: str) -> str:
    """Route a synthesis request to the provider configured for *model*.

    Args:
        text: Text to synthesize.
        model: Model ID (a key of ``model_mapping``).

    Returns:
        Path to the generated audio file.

    Raises:
        ValueError: If the model ID or its provider is unknown.
    """
    print(f"[TTS] Predicting for model: {model}")
    if model not in model_mapping:
        raise ValueError(f"지원하지 않는 모델입니다: {model}")

    config = model_mapping[model]
    provider = config["provider"]

    # Guard-clause dispatch: each provider call returns immediately.
    if provider == "channel":
        return predict_channel_tts(text, config.get("voice", "hana"))
    if provider == "openai":
        return predict_openai_tts(
            text,
            config.get("model", "tts-1"),
            config.get("voice", "alloy"),
        )
    if provider == "google":
        return predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
    if provider == "elevenlabs":
        return predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
    raise ValueError(f"알 수 없는 provider: {provider}")
if __name__ == "__main__":
    # Smoke test: synthesize one short Korean sentence with Channel TTS.
    test_text = "안녕하세요, 채널톡 TTS 테스트입니다."
    print("Testing Channel TTS...")
    try:
        result_path = predict_channel_tts(test_text)
        print(f" Success: {result_path}")
    except Exception as err:
        print(f" Error: {err}")