Ko-TTS-Arena Contributors committed on
Commit
b0bdfc9
·
1 Parent(s): 407795c

fix: Add ffmpeg to the Docker image, disable Gemini TTS (requires OAuth2), and keep the REST API code for future use

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -0
  2. models.py +9 -9
  3. requirements.txt +1 -2
  4. tts.py +31 -28
Dockerfile CHANGED
@@ -6,6 +6,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
6
  wget \
7
  curl \
8
  git \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
  # Create non-root user
 
6
  wget \
7
  curl \
8
  git \
9
+ ffmpeg \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Create non-root user
models.py CHANGED
@@ -649,15 +649,15 @@ def insert_initial_models():
649
  is_active=has_typecast,
650
  model_url="https://typecast.ai/",
651
  ),
652
- # Gemini TTS (Google Cloud - 다국어 지원) - API 있을 때만 활성화
653
- Model(
654
- id="gemini-tts-aoede",
655
- name="Gemini TTS (Aoede)",
656
- model_type=ModelType.TTS,
657
- is_open=False,
658
- is_active=has_gemini_tts,
659
- model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
660
- ),
661
  ]
662
 
663
  for model in tts_models:
 
649
  is_active=has_typecast,
650
  model_url="https://typecast.ai/",
651
  ),
652
+ # Gemini TTS (Google Cloud - 다국어 지원) - OAuth2 인증 필요, 현재 비활성화
653
+ # Model(
654
+ # id="gemini-tts-aoede",
655
+ # name="Gemini TTS (Aoede)",
656
+ # model_type=ModelType.TTS,
657
+ # is_open=False,
658
+ # is_active=has_gemini_tts,
659
+ # model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
660
+ # ),
661
  ]
662
 
663
  for model in tts_models:
requirements.txt CHANGED
@@ -14,5 +14,4 @@ huggingface-hub
14
  scipy
15
  numpy
16
  pydub
17
- typecast-python
18
- google-cloud-texttospeech
 
14
  scipy
15
  numpy
16
  pydub
17
+ typecast-python
 
tts.py CHANGED
@@ -448,47 +448,50 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
448
 
449
 
450
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
451
- """Gemini TTS API 호출 (Google Cloud Text-to-Speech)"""
452
  api_key = GEMINI_TTS_API_KEY
453
  if not api_key:
454
  raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
455
 
456
  try:
457
- import os
458
- os.environ['GOOGLE_API_KEY'] = api_key
459
 
460
- from google.api_core.client_options import ClientOptions
461
- from google.cloud import texttospeech_v1beta1 as texttospeech
462
-
463
- client = texttospeech.TextToSpeechClient(
464
- client_options=ClientOptions(api_endpoint='texttospeech.googleapis.com')
465
- )
 
 
 
 
 
 
 
 
 
466
 
467
- voice_params = texttospeech.VoiceSelectionParams(
468
- name=voice,
469
- language_code='ko-kr',
470
- model_name=model
 
471
  )
 
472
 
473
- # Synthesize speech with natural prompt
474
- response = client.synthesize_speech(
475
- input=texttospeech.SynthesisInput(
476
- text=text,
477
- prompt='친절하고 자연스러운 톤으로 말해주세요'
478
- ),
479
- voice=voice_params,
480
- audio_config=texttospeech.AudioConfig(
481
- audio_encoding=texttospeech.AudioEncoding.LINEAR16,
482
- sample_rate_hertz=24000
483
- ),
484
- )
485
 
 
486
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
487
- f.write(response.audio_content)
488
  return f.name
489
 
490
- except ImportError:
491
- raise ValueError("google-cloud-texttospeech 패키지가 설치되지 않았습니다.")
492
  except Exception as e:
493
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
494
 
 
448
 
449
 
450
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
451
+ """Gemini TTS API 호출 (REST API 방식)"""
452
  api_key = GEMINI_TTS_API_KEY
453
  if not api_key:
454
  raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
455
 
456
  try:
457
+ # REST API 엔드포인트
458
+ url = f"https://texttospeech.googleapis.com/v1beta1/text:synthesize?key={api_key}"
459
 
460
+ payload = {
461
+ "input": {
462
+ "text": text,
463
+ "prompt": "친절하고 자연스러운 톤으로 말해주세요"
464
+ },
465
+ "voice": {
466
+ "languageCode": "ko-kr",
467
+ "name": voice,
468
+ "modelName": model
469
+ },
470
+ "audioConfig": {
471
+ "audioEncoding": "LINEAR16",
472
+ "sampleRateHertz": 24000
473
+ }
474
+ }
475
 
476
+ response = requests.post(
477
+ url,
478
+ headers={"Content-Type": "application/json"},
479
+ json=payload,
480
+ timeout=60
481
  )
482
+ response.raise_for_status()
483
 
484
+ audio_content = response.json().get("audioContent")
485
+ if not audio_content:
486
+ raise ValueError("Gemini TTS API가 오디오를 반환하지 않았습니다.")
 
 
 
 
 
 
 
 
 
487
 
488
+ audio_bytes = base64.b64decode(audio_content)
489
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
490
+ f.write(audio_bytes)
491
  return f.name
492
 
493
+ except requests.exceptions.RequestException as e:
494
+ raise ValueError(f"Gemini TTS API 요청 오류: {str(e)}")
495
  except Exception as e:
496
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
497