Ko-TTS-Arena Contributors committed on
Commit
c571e18
·
1 Parent(s): ee853d8

Add Cartesia Sonic 3 TTS with Japanese Kora voice

Browse files
Files changed (2) hide show
  1. models.py +10 -0
  2. tts.py +59 -0
models.py CHANGED
@@ -637,6 +637,7 @@ def insert_initial_models():
637
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
638
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID")) and bool(os.getenv("CLOVA_API_KEY"))
639
  has_narakeet = bool(os.getenv("NARAKEET_API_KEY"))
 
640
 
641
  tts_models = [
642
  # ElevenLabs (多言語対応) - API キーがある時のみ活性化
@@ -693,6 +694,15 @@ def insert_initial_models():
693
  is_active=has_narakeet,
694
  model_url="https://www.narakeet.com/",
695
  ),
 
 
 
 
 
 
 
 
 
696
  ]
697
 
698
  for model in tts_models:
 
637
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
638
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID")) and bool(os.getenv("CLOVA_API_KEY"))
639
  has_narakeet = bool(os.getenv("NARAKEET_API_KEY"))
640
+ has_cartesia = bool(os.getenv("CARTESIA_API_KEY"))
641
 
642
  tts_models = [
643
  # ElevenLabs (多言語対応) - API キーがある時のみ活性化
 
694
  is_active=has_narakeet,
695
  model_url="https://www.narakeet.com/",
696
  ),
697
+ # Cartesia TTS (日本語対応) - Sonic 3 Kora voice
698
+ Model(
699
+ id="cartesia-sonic3",
700
+ name="Cartesia Sonic 3",
701
+ model_type=ModelType.TTS,
702
+ is_open=False,
703
+ is_active=has_cartesia,
704
+ model_url="https://cartesia.ai/",
705
+ ),
706
  ]
707
 
708
  for model in tts_models:
tts.py CHANGED
@@ -58,6 +58,9 @@ TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
58
  # Narakeet TTS
59
  NARAKEET_API_KEY = os.getenv("NARAKEET_API_KEY")
60
 
 
 
 
61
  # Gemini TTS (Google Cloud) - 서비스 계정 JSON 필요 (API Key 미지원)
62
  GOOGLE_APPLICATION_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
63
  if GOOGLE_APPLICATION_CREDENTIALS_JSON:
@@ -216,6 +219,13 @@ model_mapping = {
216
  "provider": "narakeet",
217
  "voice": "kaori", # 日本語女性音声
218
  },
 
 
 
 
 
 
 
219
  }
220
 
221
 
@@ -555,6 +565,46 @@ def predict_narakeet_tts(text: str, voice: str = "kaori") -> str:
555
  return f.name
556
 
557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
559
  """Gemini 2.5 Flash Preview TTS API 호출 (API Key 방식)"""
560
  if not GEMINI_API_KEY:
@@ -691,6 +741,15 @@ def predict_tts(text: str, model: str) -> str:
691
  audio_path = predict_narakeet_tts(text, config.get("voice", "kaori"))
692
  is_mp3 = True # Narakeet returns MP3
693
 
 
 
 
 
 
 
 
 
 
694
  else:
695
  raise ValueError(f"不明なprovider: {provider}")
696
 
 
58
  # Narakeet TTS
59
  NARAKEET_API_KEY = os.getenv("NARAKEET_API_KEY")
60
 
61
+ # Cartesia TTS
62
+ CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
63
+
64
  # Gemini TTS (Google Cloud) - 서비스 계정 JSON 필요 (API Key 미지원)
65
  GOOGLE_APPLICATION_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
66
  if GOOGLE_APPLICATION_CREDENTIALS_JSON:
 
219
  "provider": "narakeet",
220
  "voice": "kaori", # 日本語女性音声
221
  },
222
+ # Cartesia TTS (日本語対応) - Sonic 3 model
223
+ "cartesia-sonic3": {
224
+ "provider": "cartesia",
225
+ "model_id": "sonic-3",
226
+ "voice_id": "a0e99841-438c-4a64-b679-ae501e7d6091", # Kora - Japanese female
227
+ "language": "ja",
228
+ },
229
  }
230
 
231
 
 
565
  return f.name
566
 
567
 
568
+ def predict_cartesia_tts(text: str, model_id: str = "sonic-3", voice_id: str = "a0e99841-438c-4a64-b679-ae501e7d6091", language: str = "ja") -> str:
569
+ """Cartesia TTS API 呼び出し (Sonic 3 model)"""
570
+ api_key = CARTESIA_API_KEY
571
+ if not api_key:
572
+ raise ValueError("CARTESIA_API_KEY 環境変数が設定されていません。")
573
+
574
+ url = "https://api.cartesia.ai/tts/bytes"
575
+
576
+ payload = {
577
+ "model_id": model_id,
578
+ "transcript": text,
579
+ "voice": {
580
+ "mode": "id",
581
+ "id": voice_id
582
+ },
583
+ "language": language,
584
+ "output_format": {
585
+ "container": "wav",
586
+ "encoding": "pcm_s16le",
587
+ "sample_rate": 24000
588
+ }
589
+ }
590
+
591
+ response = requests.post(
592
+ url,
593
+ headers={
594
+ "Authorization": f"Bearer {api_key}",
595
+ "Cartesia-Version": "2024-06-10",
596
+ "Content-Type": "application/json",
597
+ },
598
+ json=payload,
599
+ timeout=60,
600
+ )
601
+ response.raise_for_status()
602
+
603
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
604
+ f.write(response.content)
605
+ return f.name
606
+
607
+
608
  def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
609
  """Gemini 2.5 Flash Preview TTS API 호출 (API Key 방식)"""
610
  if not GEMINI_API_KEY:
 
741
  audio_path = predict_narakeet_tts(text, config.get("voice", "kaori"))
742
  is_mp3 = True # Narakeet returns MP3
743
 
744
+ elif provider == "cartesia":
745
+ audio_path = predict_cartesia_tts(
746
+ text,
747
+ config.get("model_id", "sonic-3"),
748
+ config.get("voice_id", "a0e99841-438c-4a64-b679-ae501e7d6091"),
749
+ config.get("language", "ja"),
750
+ )
751
+ # Cartesia returns WAV at 24kHz
752
+
753
  else:
754
  raise ValueError(f"不明なprovider: {provider}")
755