Spaces:
Sleeping
Sleeping
Ko-TTS-Arena Contributors
committed on
Commit
·
407795c
1
Parent(s):
9c1f76a
feat: Add Gemini TTS (Aoede voice) - Google Cloud Text-to-Speech
Browse files
- models.py +10 -0
- requirements.txt +2 -1
- tts.py +63 -0
models.py
CHANGED
|
@@ -566,6 +566,7 @@ def insert_initial_models():
|
|
| 566 |
has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
|
| 567 |
has_humelo = bool(os.getenv("HUMELO_API_KEY"))
|
| 568 |
has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
|
|
|
|
| 569 |
|
| 570 |
tts_models = [
|
| 571 |
# 채널톡 TTS (한국어 특화) - 항상 활성화
|
|
@@ -648,6 +649,15 @@ def insert_initial_models():
|
|
| 648 |
is_active=has_typecast,
|
| 649 |
model_url="https://typecast.ai/",
|
| 650 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
]
|
| 652 |
|
| 653 |
for model in tts_models:
|
|
|
|
| 566 |
has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
|
| 567 |
has_humelo = bool(os.getenv("HUMELO_API_KEY"))
|
| 568 |
has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
|
| 569 |
+
has_gemini_tts = bool(os.getenv("GEMINI_TTS_API_KEY"))
|
| 570 |
|
| 571 |
tts_models = [
|
| 572 |
# 채널톡 TTS (한국어 특화) - 항상 활성화
|
|
|
|
| 649 |
is_active=has_typecast,
|
| 650 |
model_url="https://typecast.ai/",
|
| 651 |
),
|
| 652 |
+
# Gemini TTS (Google Cloud - 다국어 지원) - API 키 있을 때만 활성화
|
| 653 |
+
Model(
|
| 654 |
+
id="gemini-tts-aoede",
|
| 655 |
+
name="Gemini TTS (Aoede)",
|
| 656 |
+
model_type=ModelType.TTS,
|
| 657 |
+
is_open=False,
|
| 658 |
+
is_active=has_gemini_tts,
|
| 659 |
+
model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
|
| 660 |
+
),
|
| 661 |
]
|
| 662 |
|
| 663 |
for model in tts_models:
|
requirements.txt
CHANGED
|
@@ -14,4 +14,5 @@ huggingface-hub
|
|
| 14 |
scipy
|
| 15 |
numpy
|
| 16 |
pydub
|
| 17 |
-
typecast-python
|
|
|
|
|
|
| 14 |
scipy
|
| 15 |
numpy
|
| 16 |
pydub
|
| 17 |
+
typecast-python
|
| 18 |
+
google-cloud-texttospeech
|
tts.py
CHANGED
|
@@ -55,6 +55,9 @@ HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-syn
|
|
| 55 |
# Typecast TTS
|
| 56 |
TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
def resample_wav_to_16khz(input_path: str) -> str:
|
| 59 |
"""
|
| 60 |
Resample a WAV file to 16kHz for fair comparison.
|
|
@@ -213,6 +216,12 @@ model_mapping = {
|
|
| 213 |
"voice_id": "tc_5c789c337ad86500073a02cd",
|
| 214 |
"model": "ssfm-v21",
|
| 215 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
}
|
| 217 |
|
| 218 |
|
|
@@ -438,6 +447,52 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
|
|
| 438 |
raise ValueError(f"Typecast TTS API 오류: {str(e)}")
|
| 439 |
|
| 440 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
|
| 442 |
"""Google Cloud TTS API 호출"""
|
| 443 |
api_key = os.getenv("GOOGLE_API_KEY")
|
|
@@ -538,6 +593,14 @@ def predict_tts(text: str, model: str) -> str:
|
|
| 538 |
)
|
| 539 |
# Typecast returns WAV
|
| 540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
else:
|
| 542 |
raise ValueError(f"알 수 없는 provider: {provider}")
|
| 543 |
|
|
|
|
| 55 |
# Typecast TTS
|
| 56 |
TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
|
| 57 |
|
| 58 |
+
# Gemini TTS (Google Cloud)
|
| 59 |
+
GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
|
| 60 |
+
|
| 61 |
def resample_wav_to_16khz(input_path: str) -> str:
|
| 62 |
"""
|
| 63 |
Resample a WAV file to 16kHz for fair comparison.
|
|
|
|
| 216 |
"voice_id": "tc_5c789c337ad86500073a02cd",
|
| 217 |
"model": "ssfm-v21",
|
| 218 |
},
|
| 219 |
+
# Gemini TTS (Google Cloud - 다국어 지원)
|
| 220 |
+
"gemini-tts-aoede": {
|
| 221 |
+
"provider": "gemini",
|
| 222 |
+
"voice": "Aoede",
|
| 223 |
+
"model": "gemini-2.5-flash-tts",
|
| 224 |
+
},
|
| 225 |
}
|
| 226 |
|
| 227 |
|
|
|
|
| 447 |
raise ValueError(f"Typecast TTS API 오류: {str(e)}")
|
| 448 |
|
| 449 |
|
| 450 |
+
def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
    """Synthesize speech with Gemini TTS through Google Cloud Text-to-Speech.

    Args:
        text: Text to be spoken.
        voice: Voice name forwarded to ``VoiceSelectionParams``.
        model: Model name forwarded to ``VoiceSelectionParams``.

    Returns:
        Path of a temporary ``.wav`` file containing the audio bytes returned
        by the API. The file is created with ``delete=False``, so the caller
        is responsible for removing it.

    Raises:
        ValueError: If ``GEMINI_TTS_API_KEY`` is not configured, if the
            ``google-cloud-texttospeech`` package cannot be imported, or if
            the synthesis request fails for any other reason.
    """
    api_key = GEMINI_TTS_API_KEY
    if not api_key:
        raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")

    # Imported lazily so the rest of the module works without the optional
    # Google Cloud dependency installed.
    try:
        from google.api_core.client_options import ClientOptions
        from google.cloud import texttospeech_v1beta1 as texttospeech
    except ImportError as e:
        raise ValueError("google-cloud-texttospeech 패키지가 설치되지 않았습니다.") from e

    try:
        # Hand the key to this client only. Writing it into
        # os.environ["GOOGLE_API_KEY"] would be a process-wide side effect and
        # would overwrite the key that predict_google_tts reads from the same
        # environment variable.
        client = texttospeech.TextToSpeechClient(
            client_options=ClientOptions(
                api_endpoint="texttospeech.googleapis.com",
                api_key=api_key,
            )
        )

        voice_params = texttospeech.VoiceSelectionParams(
            name=voice,
            language_code="ko-kr",
            model_name=model,
        )

        # The prompt asks the model for a friendly, natural speaking tone.
        response = client.synthesize_speech(
            input=texttospeech.SynthesisInput(
                text=text,
                prompt='친절하고 자연스러운 톤으로 말해주세요',
            ),
            voice=voice_params,
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.LINEAR16,
                sample_rate_hertz=24000,
            ),
        )

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(response.audio_content)
            return f.name

    except Exception as e:
        # Callers expect a ValueError for every provider failure; keep the
        # original exception attached as the cause for debugging.
        raise ValueError(f"Gemini TTS API 오류: {str(e)}") from e
|
| 494 |
+
|
| 495 |
+
|
| 496 |
def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
|
| 497 |
"""Google Cloud TTS API 호출"""
|
| 498 |
api_key = os.getenv("GOOGLE_API_KEY")
|
|
|
|
| 593 |
)
|
| 594 |
# Typecast returns WAV
|
| 595 |
|
| 596 |
+
elif provider == "gemini":
|
| 597 |
+
audio_path = predict_gemini_tts(
|
| 598 |
+
text,
|
| 599 |
+
config.get("voice", "Aoede"),
|
| 600 |
+
config.get("model", "gemini-2.5-flash-tts"),
|
| 601 |
+
)
|
| 602 |
+
# Gemini TTS returns WAV at 24kHz
|
| 603 |
+
|
| 604 |
else:
|
| 605 |
raise ValueError(f"알 수 없는 provider: {provider}")
|
| 606 |
|