Spaces:
Sleeping
Sleeping
| import io | |
| import os | |
| import struct | |
| from google import genai | |
| from google.genai import types | |
def _make_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bits_per_sample: int = 16) -> bytes:
    """Prefix raw PCM sample bytes with a canonical 44-byte WAV (RIFF) header.

    Args:
        pcm_data: Raw PCM audio bytes; copied verbatim after the header.
        sample_rate: Samples per second written to the ``fmt`` chunk.
        channels: Channel count written to the ``fmt`` chunk.
        bits_per_sample: Bit depth written to the ``fmt`` chunk.

    Returns:
        The header followed by ``pcm_data`` as a single bytes object.
    """
    payload_len = len(pcm_data)
    frame_bytes = channels * bits_per_sample // 8
    bytes_per_second = sample_rate * channels * bits_per_sample // 8

    # "<" selects little-endian with no alignment padding, so the packed
    # fields land back to back exactly as in the on-disk header layout.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        # RIFF descriptor: the size field counts everything after itself
        # (36 remaining header bytes plus the sample payload).
        b"RIFF",
        36 + payload_len,
        b"WAVE",
        # fmt chunk: 16-byte body, format tag 1 (PCM).
        b"fmt ",
        16,
        1,
        channels,
        sample_rate,
        bytes_per_second,
        frame_bytes,
        bits_per_sample,
        # data chunk header; the samples themselves follow.
        b"data",
        payload_len,
    )
    return b"".join((header, pcm_data))
def generate_speech(
    text: str,
    *,
    voice_name: str = "Kore",
    model: str = "gemini-2.5-flash-preview-tts",
) -> bytes:
    """Generate speech audio from text using Gemini TTS.

    Args:
        text: The text to convert to speech.
        voice_name: Name of the prebuilt voice to request. Defaults to
            "Kore", the voice this function always used previously.
        model: Identifier of the TTS model to call. Defaults to the
            previously hard-coded preview model.

    Returns:
        WAV audio bytes ready to play in a browser.

    Raises:
        RuntimeError: If the response carries no inline audio data (for
            example when generation was blocked or returned no candidates).
    """
    # The API key is read from the environment on every call.
    client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
    response = client.models.generate_content(
        model=model,
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                ),
            ),
        ),
    )

    # Walk the response defensively: candidates, content, parts and
    # inline_data may each be missing, and blind indexing would surface
    # that as an opaque IndexError/AttributeError.
    for candidate in response.candidates or []:
        content = candidate.content
        parts = content.parts if content is not None and content.parts else []
        for part in parts:
            inline = part.inline_data
            if inline is not None and inline.data:
                # _make_wav is called with its defaults (24 kHz, mono, 16-bit).
                # NOTE(review): presumably this matches the PCM format the
                # model returns; confirm against the API documentation.
                return _make_wav(inline.data)

    raise RuntimeError("Gemini TTS response contained no audio data")