Spaces:
Runtime error
Runtime error
Ko-TTS-Arena Contributors committed on
Commit ·
c571e18
1
Parent(s): ee853d8
Add Cartesia Sonic 3 TTS with Japanese Kora voice
Browse files
models.py
CHANGED
|
@@ -637,6 +637,7 @@ def insert_initial_models():
|
|
| 637 |
has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
|
| 638 |
has_clova = bool(os.getenv("CLOVA_CLIENT_ID")) and bool(os.getenv("CLOVA_API_KEY"))
|
| 639 |
has_narakeet = bool(os.getenv("NARAKEET_API_KEY"))
|
|
|
|
| 640 |
|
| 641 |
tts_models = [
|
| 642 |
# ElevenLabs (多言語対応) - API キーがある時のみ活性化
|
|
@@ -693,6 +694,15 @@ def insert_initial_models():
|
|
| 693 |
is_active=has_narakeet,
|
| 694 |
model_url="https://www.narakeet.com/",
|
| 695 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
]
|
| 697 |
|
| 698 |
for model in tts_models:
|
|
|
|
| 637 |
has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
|
| 638 |
has_clova = bool(os.getenv("CLOVA_CLIENT_ID")) and bool(os.getenv("CLOVA_API_KEY"))
|
| 639 |
has_narakeet = bool(os.getenv("NARAKEET_API_KEY"))
|
| 640 |
+
has_cartesia = bool(os.getenv("CARTESIA_API_KEY"))
|
| 641 |
|
| 642 |
tts_models = [
|
| 643 |
# ElevenLabs (多言語対応) - API キーがある時のみ活性化
|
|
|
|
| 694 |
is_active=has_narakeet,
|
| 695 |
model_url="https://www.narakeet.com/",
|
| 696 |
),
|
| 697 |
+
# Cartesia TTS (日本語対応) - Sonic 3 Kora voice
|
| 698 |
+
Model(
|
| 699 |
+
id="cartesia-sonic3",
|
| 700 |
+
name="Cartesia Sonic 3",
|
| 701 |
+
model_type=ModelType.TTS,
|
| 702 |
+
is_open=False,
|
| 703 |
+
is_active=has_cartesia,
|
| 704 |
+
model_url="https://cartesia.ai/",
|
| 705 |
+
),
|
| 706 |
]
|
| 707 |
|
| 708 |
for model in tts_models:
|
tts.py
CHANGED
|
@@ -58,6 +58,9 @@ TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
|
|
| 58 |
# Narakeet TTS
|
| 59 |
NARAKEET_API_KEY = os.getenv("NARAKEET_API_KEY")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
# Gemini TTS (Google Cloud) - 서비스 계정 JSON 필요 (API Key 미지원)
|
| 62 |
GOOGLE_APPLICATION_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 63 |
if GOOGLE_APPLICATION_CREDENTIALS_JSON:
|
|
@@ -216,6 +219,13 @@ model_mapping = {
|
|
| 216 |
"provider": "narakeet",
|
| 217 |
"voice": "kaori", # 日本語女性音声
|
| 218 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
}
|
| 220 |
|
| 221 |
|
|
@@ -555,6 +565,46 @@ def predict_narakeet_tts(text: str, voice: str = "kaori") -> str:
|
|
| 555 |
return f.name
|
| 556 |
|
| 557 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
|
| 559 |
"""Gemini 2.5 Flash Preview TTS API 호출 (API Key 방식)"""
|
| 560 |
if not GEMINI_API_KEY:
|
|
@@ -691,6 +741,15 @@ def predict_tts(text: str, model: str) -> str:
|
|
| 691 |
audio_path = predict_narakeet_tts(text, config.get("voice", "kaori"))
|
| 692 |
is_mp3 = True # Narakeet returns MP3
|
| 693 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
else:
|
| 695 |
raise ValueError(f"不明なprovider: {provider}")
|
| 696 |
|
|
|
|
| 58 |
# Narakeet TTS
|
| 59 |
NARAKEET_API_KEY = os.getenv("NARAKEET_API_KEY")
|
| 60 |
|
| 61 |
+
# Cartesia TTS
|
| 62 |
+
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
|
| 63 |
+
|
| 64 |
# Gemini TTS (Google Cloud) - 서비스 계정 JSON 필요 (API Key 미지원)
|
| 65 |
GOOGLE_APPLICATION_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 66 |
if GOOGLE_APPLICATION_CREDENTIALS_JSON:
|
|
|
|
| 219 |
"provider": "narakeet",
|
| 220 |
"voice": "kaori", # 日本語女性音声
|
| 221 |
},
|
| 222 |
+
# Cartesia TTS (日本語対応) - Sonic 3 model
|
| 223 |
+
"cartesia-sonic3": {
|
| 224 |
+
"provider": "cartesia",
|
| 225 |
+
"model_id": "sonic-3",
|
| 226 |
+
"voice_id": "a0e99841-438c-4a64-b679-ae501e7d6091", # Kora - Japanese female
|
| 227 |
+
"language": "ja",
|
| 228 |
+
},
|
| 229 |
}
|
| 230 |
|
| 231 |
|
|
|
|
| 565 |
return f.name
|
| 566 |
|
| 567 |
|
| 568 |
+
def predict_cartesia_tts(text: str, model_id: str = "sonic-3", voice_id: str = "a0e99841-438c-4a64-b679-ae501e7d6091", language: str = "ja") -> str:
    """Synthesize speech through the Cartesia TTS bytes endpoint.

    Sends ``text`` to ``https://api.cartesia.ai/tts/bytes`` and stores the
    returned audio in a temporary ``.wav`` file that is NOT deleted
    automatically; the caller owns the file afterwards.

    Args:
        text: Transcript to synthesize.
        model_id: Cartesia model identifier sent as ``model_id``.
        voice_id: Cartesia voice identifier, sent with voice mode ``"id"``.
        language: Language code sent as ``language``.

    Returns:
        Filesystem path of the temporary WAV file holding the response body.

    Raises:
        ValueError: If ``CARTESIA_API_KEY`` is unset or empty.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            API answers with an HTTP error status.
    """
    if not CARTESIA_API_KEY:
        raise ValueError("CARTESIA_API_KEY 環境変数が設定されていません。")

    endpoint = "https://api.cartesia.ai/tts/bytes"

    request_headers = {
        "Authorization": f"Bearer {CARTESIA_API_KEY}",
        "Cartesia-Version": "2024-06-10",
        "Content-Type": "application/json",
    }

    # Ask for 16-bit little-endian PCM in a WAV container at 24 kHz.
    audio_format = {
        "container": "wav",
        "encoding": "pcm_s16le",
        "sample_rate": 24000,
    }
    request_body = {
        "model_id": model_id,
        "transcript": text,
        "voice": {"mode": "id", "id": voice_id},
        "language": language,
        "output_format": audio_format,
    }

    reply = requests.post(
        endpoint,
        headers=request_headers,
        json=request_body,
        timeout=60,
    )
    reply.raise_for_status()

    # delete=False keeps the file on disk after the handle is closed so the
    # returned path remains usable by the caller.
    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(reply.content)
    return wav_file.name
|
| 606 |
+
|
| 607 |
+
|
| 608 |
def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
|
| 609 |
"""Gemini 2.5 Flash Preview TTS API 호출 (API Key 방식)"""
|
| 610 |
if not GEMINI_API_KEY:
|
|
|
|
| 741 |
audio_path = predict_narakeet_tts(text, config.get("voice", "kaori"))
|
| 742 |
is_mp3 = True # Narakeet returns MP3
|
| 743 |
|
| 744 |
+
elif provider == "cartesia":
|
| 745 |
+
audio_path = predict_cartesia_tts(
|
| 746 |
+
text,
|
| 747 |
+
config.get("model_id", "sonic-3"),
|
| 748 |
+
config.get("voice_id", "a0e99841-438c-4a64-b679-ae501e7d6091"),
|
| 749 |
+
config.get("language", "ja"),
|
| 750 |
+
)
|
| 751 |
+
# Cartesia returns WAV at 24kHz
|
| 752 |
+
|
| 753 |
else:
|
| 754 |
raise ValueError(f"不明なprovider: {provider}")
|
| 755 |
|