KO-TTS-Arena

Runtime error

App Files Files Community

Ko-TTS-Arena Contributors committed on Dec 17, 2025

Commit

14b59fc

1 Parent(s): a05825a

Add Gemini 2.5 Flash Preview TTS (API Key based)

Browse files

Files changed (2) hide show

models.py +11 -0
tts.py +95 -0

models.py CHANGED Viewed

@@ -639,6 +639,8 @@ def insert_initial_models():
     has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
     # Gemini TTS는 서비스 계정 JSON이 필요 (API Key 미지원)
     has_gemini_tts = bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
     tts_models = [
         # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -747,6 +749,15 @@ def insert_initial_models():
             is_active=has_gemini_tts,
             model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
         ),
     ]
     for model in tts_models:

     has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
     # Gemini TTS는 서비스 계정 JSON이 필요 (API Key 미지원)
     has_gemini_tts = bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
+    # Gemini 2.5 Flash Preview TTS는 API Key 방식
+    has_gemini_api = bool(os.getenv("GEMINI_API_KEY"))
     tts_models = [
         # 채널톡 TTS (한국어 특화) - 항상 활성화
             is_active=has_gemini_tts,
             model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
         ),
+        # Gemini 2.5 Flash Preview TTS (API Key 방식)
+        Model(
+            id="gemini-2.5-flash-preview-tts",
+            name="Gemini 2.5 Flash Preview TTS",
+            model_type=ModelType.TTS,
+            is_open=False,
+            is_active=has_gemini_api,
+            model_url="https://ai.google.dev/gemini-api/docs/audio",
+        ),
     ]
     for model in tts_models:

tts.py CHANGED Viewed

@@ -67,6 +67,9 @@ if GOOGLE_APPLICATION_CREDENTIALS_JSON:
     except Exception as e:
         print(f"[Gemini TTS] Failed to save credentials: {e}")
 def resample_wav_to_16khz(input_path: str) -> str:
     """
     Resample a WAV file to 16kHz for fair comparison.
@@ -247,6 +250,11 @@ model_mapping = {
         "voice": "Aoede",
         "model": "gemini-2.5-flash-tts",
     },
 }
@@ -561,6 +569,86 @@ def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
         return f.name
 def predict_tts(text: str, model: str) -> str:
     """
     TTS 생성 메인 함수
@@ -635,6 +723,13 @@ def predict_tts(text: str, model: str) -> str:
         )
         # Gemini TTS returns WAV at 24kHz
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")

     except Exception as e:
         print(f"[Gemini TTS] Failed to save credentials: {e}")
+# Gemini 2.5 Flash Preview TTS - API Key 방식
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 def resample_wav_to_16khz(input_path: str) -> str:
     """
     Resample a WAV file to 16kHz for fair comparison.
         "voice": "Aoede",
         "model": "gemini-2.5-flash-tts",
     },
+    # Gemini 2.5 Flash Preview TTS (API Key 방식)
+    "gemini-2.5-flash-preview-tts": {
+        "provider": "gemini-2.5-flash",
+        "voice": "Kore",
+    },
 }
         return f.name
+def predict_gemini_2_5_flash_tts(text: str, voice: str = "Kore") -> str:
+    """Synthesize speech with the Gemini 2.5 Flash Preview TTS API (API-key auth).
+
+    Posts ``text`` to the ``generateContent`` endpoint, decodes the base64
+    audio payload of the first candidate, and stores it as a WAV file.
+
+    Args:
+        text: The text to synthesize.
+        voice: Name of the prebuilt voice sent as ``voiceName``.
+
+    Returns:
+        Path of a newly created temporary ``.wav`` file. The file is created
+        with ``delete=False``; this function does not remove it on success.
+
+    Raises:
+        ValueError: If ``GEMINI_API_KEY`` is not set, the response does not
+            have the expected structure, or the WAV file cannot be written.
+        requests.HTTPError: If the API answers with an error status
+            (raised by ``response.raise_for_status()``).
+    """
+    # Function-scope import, so this block does not rely on a module-level one.
+    import wave
+    if not GEMINI_API_KEY:
+        raise ValueError("GEMINI_API_KEY 환경 변수가 설정되지 않았습니다.")
+    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent"
+    response = requests.post(
+        url,
+        headers={
+            "x-goog-api-key": GEMINI_API_KEY,
+            "Content-Type": "application/json",
+        },
+        json={
+            "contents": [{
+                "parts": [{
+                    "text": text
+                }]
+            }],
+            "generationConfig": {
+                "responseModalities": ["AUDIO"],
+                "speechConfig": {
+                    "voiceConfig": {
+                        "prebuiltVoiceConfig": {
+                            "voiceName": voice
+                        }
+                    }
+                }
+            },
+            "model": "gemini-2.5-flash-preview-tts",
+        },
+        timeout=60,
+    )
+    response.raise_for_status()
+    result = response.json()
+    # Extract the base64 audio payload. TypeError is caught as well so that a
+    # null/non-container node in the JSON is reported as a malformed response
+    # instead of escaping as an unrelated exception.
+    try:
+        audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
+    except (KeyError, IndexError, TypeError) as e:
+        raise ValueError(f"Gemini 2.5 Flash TTS API가 예상한 형식의 응답을 반환하지 않았습니다: {e}") from e
+    # Decode base64 to raw PCM samples.
+    pcm_bytes = base64.b64decode(audio_data_b64)
+    # Reserve the output path first so it can be cleaned up if writing fails.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
+        wav_path = wav_file.name
+    # Wrap the PCM data in a WAV container using the standard library; this
+    # avoids the intermediate .pcm file and the external ffmpeg process.
+    # PCM format: signed 16-bit little-endian, 24kHz, mono.
+    try:
+        with wave.open(wav_path, "wb") as wav_writer:
+            wav_writer.setnchannels(1)
+            wav_writer.setsampwidth(2)
+            wav_writer.setframerate(24000)
+            wav_writer.writeframes(pcm_bytes)
+    except (wave.Error, OSError) as e:
+        # Do not leave a partial output file behind on failure.
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+        raise ValueError(f"PCM to WAV 변환 실패: {str(e)}") from e
+    return wav_path
 def predict_tts(text: str, model: str) -> str:
     """
     TTS 생성 메인 함수
         )
         # Gemini TTS returns WAV at 24kHz
+    elif provider == "gemini-2.5-flash":
+        audio_path = predict_gemini_2_5_flash_tts(
+            text,
+            config.get("voice", "Kore"),
+        )
+        # Gemini 2.5 Flash TTS returns WAV at 24kHz (converted from PCM)
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")