Ko-TTS-Arena Contributors committed on
Commit
407795c
·
1 Parent(s): 9c1f76a

feat: Add Gemini TTS (Aoede voice) - Google Cloud Text-to-Speech

Browse files
Files changed (3) hide show
  1. models.py +10 -0
  2. requirements.txt +2 -1
  3. tts.py +63 -0
models.py CHANGED
@@ -566,6 +566,7 @@ def insert_initial_models():
566
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
567
  has_humelo = bool(os.getenv("HUMELO_API_KEY"))
568
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
 
569
 
570
  tts_models = [
571
  # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -648,6 +649,15 @@ def insert_initial_models():
648
  is_active=has_typecast,
649
  model_url="https://typecast.ai/",
650
  ),
 
 
 
 
 
 
 
 
 
651
  ]
652
 
653
  for model in tts_models:
 
566
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
567
  has_humelo = bool(os.getenv("HUMELO_API_KEY"))
568
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
569
+ has_gemini_tts = bool(os.getenv("GEMINI_TTS_API_KEY"))
570
 
571
  tts_models = [
572
  # 채널톡 TTS (한국어 특화) - 항상 활성화
 
649
  is_active=has_typecast,
650
  model_url="https://typecast.ai/",
651
  ),
652
+ # Gemini TTS (Google Cloud - 다국어 지원) - API 키 있을 때만 활성화
653
+ Model(
654
+ id="gemini-tts-aoede",
655
+ name="Gemini TTS (Aoede)",
656
+ model_type=ModelType.TTS,
657
+ is_open=False,
658
+ is_active=has_gemini_tts,
659
+ model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
660
+ ),
661
  ]
662
 
663
  for model in tts_models:
requirements.txt CHANGED
@@ -14,4 +14,5 @@ huggingface-hub
14
  scipy
15
  numpy
16
  pydub
17
- typecast-python
 
 
14
  scipy
15
  numpy
16
  pydub
17
+ typecast-python
18
+ google-cloud-texttospeech
tts.py CHANGED
@@ -55,6 +55,9 @@ HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-syn
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
 
 
 
58
  def resample_wav_to_16khz(input_path: str) -> str:
59
  """
60
  Resample a WAV file to 16kHz for fair comparison.
@@ -213,6 +216,12 @@ model_mapping = {
213
  "voice_id": "tc_5c789c337ad86500073a02cd",
214
  "model": "ssfm-v21",
215
  },
 
 
 
 
 
 
216
  }
217
 
218
 
@@ -438,6 +447,52 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
438
  raise ValueError(f"Typecast TTS API 오류: {str(e)}")
439
 
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
442
  """Google Cloud TTS API 호출"""
443
  api_key = os.getenv("GOOGLE_API_KEY")
@@ -538,6 +593,14 @@ def predict_tts(text: str, model: str) -> str:
538
  )
539
  # Typecast returns WAV
540
 
 
 
 
 
 
 
 
 
541
  else:
542
  raise ValueError(f"알 수 없는 provider: {provider}")
543
 
 
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
58
+ # Gemini TTS (Google Cloud)
59
+ GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
60
+
61
  def resample_wav_to_16khz(input_path: str) -> str:
62
  """
63
  Resample a WAV file to 16kHz for fair comparison.
 
216
  "voice_id": "tc_5c789c337ad86500073a02cd",
217
  "model": "ssfm-v21",
218
  },
219
+ # Gemini TTS (Google Cloud - 다국어 지원)
220
+ "gemini-tts-aoede": {
221
+ "provider": "gemini",
222
+ "voice": "Aoede",
223
+ "model": "gemini-2.5-flash-tts",
224
+ },
225
  }
226
 
227
 
 
447
  raise ValueError(f"Typecast TTS API 오류: {str(e)}")
448
 
449
 
450
def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
    """Synthesize Korean speech with Gemini TTS (Google Cloud Text-to-Speech).

    Args:
        text: Text to synthesize.
        voice: Voice name forwarded as ``VoiceSelectionParams.name``.
        model: Model name forwarded as ``VoiceSelectionParams.model_name``.

    Returns:
        Path to a temporary ``.wav`` file containing the audio bytes returned
        by the API. The file is created with ``delete=False``, so the caller
        is responsible for removing it.

    Raises:
        ValueError: If ``GEMINI_TTS_API_KEY`` is not configured, if the
            ``google-cloud-texttospeech`` package cannot be imported, or if
            the synthesis request fails for any reason.
    """
    api_key = GEMINI_TTS_API_KEY
    if not api_key:
        raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")

    # Import lazily so the rest of the module works without the optional
    # Google Cloud dependency; keep this guard separate from the API call so
    # only a genuinely missing package yields the "not installed" message.
    try:
        from google.api_core.client_options import ClientOptions
        from google.cloud import texttospeech_v1beta1 as texttospeech
    except ImportError as e:
        raise ValueError("google-cloud-texttospeech 패키지가 설치되지 않았습니다.") from e

    try:
        # Pass the key to this client only. Writing it into
        # os.environ['GOOGLE_API_KEY'] would overwrite the process-wide value
        # that predict_google_tts() reads for its own credentials.
        # NOTE(review): relies on ClientOptions accepting ``api_key``
        # (google-api-core >= 2.3) - confirm against the pinned version.
        client = texttospeech.TextToSpeechClient(
            client_options=ClientOptions(
                api_endpoint='texttospeech.googleapis.com',
                api_key=api_key,
            )
        )

        voice_params = texttospeech.VoiceSelectionParams(
            name=voice,
            language_code='ko-kr',
            model_name=model,
        )

        # The prompt steers the speaking style (friendly, natural tone).
        response = client.synthesize_speech(
            input=texttospeech.SynthesisInput(
                text=text,
                prompt='친절하고 자연스러운 톤으로 말해주세요',
            ),
            voice=voice_params,
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.LINEAR16,
                sample_rate_hertz=24000,
            ),
        )

        # Persist the audio so the caller can post-process it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(response.audio_content)
            return f.name

    except Exception as e:
        # Preserve the original exception as the cause for debugging.
        raise ValueError(f"Gemini TTS API 오류: {str(e)}") from e
494
+
495
+
496
  def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
497
  """Google Cloud TTS API 호출"""
498
  api_key = os.getenv("GOOGLE_API_KEY")
 
593
  )
594
  # Typecast returns WAV
595
 
596
+ elif provider == "gemini":
597
+ audio_path = predict_gemini_tts(
598
+ text,
599
+ config.get("voice", "Aoede"),
600
+ config.get("model", "gemini-2.5-flash-tts"),
601
+ )
602
+ # Gemini TTS returns WAV at 24kHz
603
+
604
  else:
605
  raise ValueError(f"알 수 없는 provider: {provider}")
606