Ko-TTS-Arena Contributors committed on
Commit
14b59fc
·
1 Parent(s): a05825a

Add Gemini 2.5 Flash Preview TTS (API Key based)

Browse files
Files changed (2) hide show
  1. models.py +11 -0
  2. tts.py +95 -0
models.py CHANGED
@@ -639,6 +639,8 @@ def insert_initial_models():
639
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
640
  # Gemini TTS는 서비스 계정 JSON이 필요 (API Key 미지원)
641
  has_gemini_tts = bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
 
 
642
 
643
  tts_models = [
644
  # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -747,6 +749,15 @@ def insert_initial_models():
747
  is_active=has_gemini_tts,
748
  model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
749
  ),
 
 
 
 
 
 
 
 
 
750
  ]
751
 
752
  for model in tts_models:
 
639
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
640
  # Gemini TTS는 서비스 계정 JSON이 필요 (API Key 미지원)
641
  has_gemini_tts = bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
642
+ # Gemini 2.5 Flash Preview TTS는 API Key 방식
643
+ has_gemini_api = bool(os.getenv("GEMINI_API_KEY"))
644
 
645
  tts_models = [
646
  # 채널톡 TTS (한국어 특화) - 항상 활성화
 
749
  is_active=has_gemini_tts,
750
  model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
751
  ),
752
+ # Gemini 2.5 Flash Preview TTS (API Key 방식)
753
+ Model(
754
+ id="gemini-2.5-flash-preview-tts",
755
+ name="Gemini 2.5 Flash Preview TTS",
756
+ model_type=ModelType.TTS,
757
+ is_open=False,
758
+ is_active=has_gemini_api,
759
+ model_url="https://ai.google.dev/gemini-api/docs/audio",
760
+ ),
761
  ]
762
 
763
  for model in tts_models:
tts.py CHANGED
@@ -67,6 +67,9 @@ if GOOGLE_APPLICATION_CREDENTIALS_JSON:
67
  except Exception as e:
68
  print(f"[Gemini TTS] Failed to save credentials: {e}")
69
 
 
 
 
70
  def resample_wav_to_16khz(input_path: str) -> str:
71
  """
72
  Resample a WAV file to 16kHz for fair comparison.
@@ -247,6 +250,11 @@ model_mapping = {
247
  "voice": "Aoede",
248
  "model": "gemini-2.5-flash-tts",
249
  },
 
 
 
 
 
250
  }
251
 
252
 
@@ -561,6 +569,86 @@ def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
561
  return f.name
562
 
563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  def predict_tts(text: str, model: str) -> str:
565
  """
566
  TTS 생성 메인 함수
@@ -635,6 +723,13 @@ def predict_tts(text: str, model: str) -> str:
635
  )
636
  # Gemini TTS returns WAV at 24kHz
637
 
 
 
 
 
 
 
 
638
  else:
639
  raise ValueError(f"알 수 없는 provider: {provider}")
640
 
 
67
  except Exception as e:
68
  print(f"[Gemini TTS] Failed to save credentials: {e}")
69
 
70
+ # Gemini 2.5 Flash Preview TTS - API Key 방식
71
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
72
+
73
  def resample_wav_to_16khz(input_path: str) -> str:
74
  """
75
  Resample a WAV file to 16kHz for fair comparison.
 
250
  "voice": "Aoede",
251
  "model": "gemini-2.5-flash-tts",
252
  },
253
+ # Gemini 2.5 Flash Preview TTS (API Key 방식)
254
+ "gemini-2.5-flash-preview-tts": {
255
+ "provider": "gemini-2.5-flash",
256
+ "voice": "Kore",
257
+ },
258
  }
259
 
260
 
 
569
  return f.name
570
 
571
 
572
def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
    """Synthesize speech with Gemini 2.5 Flash Preview TTS (API-key auth).

    Sends ``text`` to the ``generateContent`` endpoint, decodes the base64
    audio payload from the first candidate, and wraps the raw PCM in a WAV
    container via ffmpeg.

    Args:
        text: Text to synthesize.
        voice: Prebuilt voice name passed as ``voiceName`` (default "Kore").

    Returns:
        Path to a temporary ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If ``GEMINI_API_KEY`` is unset, the response does not
            contain inline audio data in the expected location, or the
            PCM-to-WAV conversion fails.
        requests.HTTPError: If the API responds with an error status.
    """
    if not GEMINI_API_KEY:
        raise ValueError("GEMINI_API_KEY 환경 변수가 설정되지 않았습니다.")

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent"

    response = requests.post(
        url,
        headers={
            "x-goog-api-key": GEMINI_API_KEY,
            "Content-Type": "application/json",
        },
        json={
            "contents": [{
                "parts": [{
                    "text": text
                }]
            }],
            "generationConfig": {
                "responseModalities": ["AUDIO"],
                "speechConfig": {
                    "voiceConfig": {
                        "prebuiltVoiceConfig": {
                            "voiceName": voice
                        }
                    }
                }
            },
            "model": "gemini-2.5-flash-preview-tts",
        },
        timeout=60,
    )
    response.raise_for_status()

    result = response.json()

    # Extract base64 audio data. TypeError covers a null / non-dict node in
    # the response tree, which would otherwise escape as an unrelated error.
    try:
        audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
    except (KeyError, IndexError, TypeError) as e:
        raise ValueError(
            f"Gemini 2.5 Flash TTS API가 예상한 형식의 응답을 반환하지 않았습니다: {e}"
        ) from e

    # Decode base64 to raw PCM samples
    pcm_bytes = base64.b64decode(audio_data_b64)

    # ffmpeg reads its input from disk, so stage the PCM in a temp file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pcm") as pcm_file:
        pcm_file.write(pcm_bytes)
        pcm_path = pcm_file.name

    # Imported locally, as in the original, so the block does not depend on
    # a file-level import that may not exist.
    import subprocess

    wav_path = None
    try:
        # Reserve the output path; ffmpeg overwrites it because of "-y"
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
            wav_path = wav_file.name

        # PCM format: s16le (signed 16-bit little-endian), 24kHz, mono
        subprocess.run([
            "ffmpeg", "-y",
            "-f", "s16le",
            "-ar", "24000",
            "-ac", "1",
            "-i", pcm_path,
            wav_path
        ], check=True, capture_output=True)

        return wav_path

    except Exception as e:
        # Do not leave a half-written or empty WAV behind on failure
        if wav_path and os.path.exists(wav_path):
            os.remove(wav_path)

        detail = str(e)
        # capture_output=True swallows ffmpeg's diagnostics; surface them
        stderr = getattr(e, "stderr", None)
        if stderr:
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", errors="replace")
            detail = f"{detail} | {stderr.strip()}"
        raise ValueError(f"PCM to WAV 변환 실패: {detail}") from e

    finally:
        # The intermediate PCM is never needed after conversion, pass or fail
        if os.path.exists(pcm_path):
            os.remove(pcm_path)
650
+
651
+
652
  def predict_tts(text: str, model: str) -> str:
653
  """
654
  TTS 생성 메인 함수
 
723
  )
724
  # Gemini TTS returns WAV at 24kHz
725
 
726
+ elif provider == "gemini-2.5-flash":
727
+ audio_path = predict_gemini_2_5_flash_tts(
728
+ text,
729
+ config.get("voice", "Kore"),
730
+ )
731
+ # Gemini 2.5 Flash TTS returns WAV at 24kHz (converted from PCM)
732
+
733
  else:
734
  raise ValueError(f"알 수 없는 provider: {provider}")
735