whung99
feat: deploy Oppy with Google API integration
0d37119
import io
import os
import struct
from google import genai
from google.genai import types
def _make_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bits_per_sample: int = 16) -> bytes:
"""Wrap raw PCM bytes in a WAV container."""
data_size = len(pcm_data)
byte_rate = sample_rate * channels * bits_per_sample // 8
block_align = channels * bits_per_sample // 8
buf = io.BytesIO()
# RIFF header
buf.write(b"RIFF")
buf.write(struct.pack("<I", 36 + data_size))
buf.write(b"WAVE")
# fmt chunk
buf.write(b"fmt ")
buf.write(struct.pack("<I", 16)) # chunk size
buf.write(struct.pack("<H", 1)) # PCM format
buf.write(struct.pack("<H", channels))
buf.write(struct.pack("<I", sample_rate))
buf.write(struct.pack("<I", byte_rate))
buf.write(struct.pack("<H", block_align))
buf.write(struct.pack("<H", bits_per_sample))
# data chunk
buf.write(b"data")
buf.write(struct.pack("<I", data_size))
buf.write(pcm_data)
return buf.getvalue()
def generate_speech(text: str) -> bytes:
    """Generate speech audio from text using Gemini TTS.

    Sends ``text`` to the ``gemini-2.5-flash-preview-tts`` model with the
    prebuilt "Kore" voice, requesting audio output only, and wraps the
    returned inline audio bytes in a WAV container via ``_make_wav``.

    Args:
        text: The text to convert to speech.

    Returns:
        WAV audio bytes ready to play in a browser.

    Raises:
        RuntimeError: If the response contains no inline audio data (for
            example no candidates, or a candidate without content/parts).
    """
    # The key is read from the GOOGLE_API_KEY environment variable at call
    # time; os.getenv yields None when it is unset.
    client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Kore",
                    )
                ),
            ),
        ),
    )

    # Any level of the response may be missing, so walk it defensively and
    # take the first part that actually carries inline data instead of
    # blindly indexing candidates[0].content.parts[0].
    for candidate in response.candidates or []:
        content = candidate.content
        parts = content.parts if content is not None else None
        for part in parts or []:
            inline = part.inline_data
            if inline is not None and inline.data:
                # NOTE(review): assumes the payload is raw PCM matching
                # _make_wav's defaults (24 kHz, mono, 16-bit) - confirm
                # against the Gemini TTS documentation.
                return _make_wav(inline.data)

    raise RuntimeError("Gemini TTS response contained no audio data")