# pip install flask google-genai
import time
import os
from flask import Flask, request, render_template_string, Response, jsonify
from google import genai
from google.genai import types
import struct
# Flask application object; the route decorators further down register their
# handlers on it.
app = Flask(__name__)
# Template source that the index() view passes to render_template_string().
# NOTE(review): the string holds only two plain-text lines and no markup — it
# looks like the HTML tags were stripped at some point; confirm against the
# intended page before relying on it.
HTML = """
Gemini TTS Test
Gemini-2.5-Flash-Preview-TTS
"""
# Shared Gemini API client used by generate_audio_from_gemini().
# The API key is taken from the GOOGLE_API_KEY environment variable only:
# credentials must never be committed to source control.
# SECURITY: a literal key used to be hard-coded here as a fallback; it must be
# treated as leaked and revoked/rotated.
# NOTE(review): when the variable is unset this passes api_key=None, leaving
# key resolution (or the resulting error) to the SDK — confirm that failing at
# startup is acceptable for this deployment.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
def wrap_pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
"""Wrap raw PCM bytes into WAV container."""
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(pcm_data)
fmt_chunk_size = 16
audio_format = 1 # PCM
header = b"RIFF" + struct.pack(" bytes:
style_prompt = f"Say the following text in a {accent} accent with a {tone} tone:\n\n{prompt}"
response = client.models.generate_content(
model="gemini-2.5-flash-preview-tts",
contents=[types.Content(role="user", parts=[types.Part(text=style_prompt)])],
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name=voice
)
)
)
)
)
candidate = response.candidates[0]
part = candidate.content.parts[0]
pcm_bytes = part.inline_data.data # raw PCM from API
if not pcm_bytes:
raise RuntimeError("No audio returned from Gemini")
return wrap_pcm_to_wav(pcm_bytes)
@app.route('/')
def index():
    """Serve the landing page by rendering the module-level HTML template."""
    page = render_template_string(HTML)
    return page
@app.route('/generate', methods=['POST'])
def gen():
    """Handle POST /generate: synthesize speech for the submitted text.

    Reads a JSON object with the string fields ``text`` (required) and
    ``voice``, ``accent``, ``tone`` (optional, each with a default).

    Returns:
        * 200 with an ``audio/wav`` body on success.
        * 400 with a JSON ``error`` when the text is missing/blank or any
          field is not a string.
        * 500 with a JSON ``error`` when generation fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing or unparsable JSON, or valid JSON that is not an object
        # (e.g. a list): treat it as an empty request so the caller gets the
        # 400 below rather than an AttributeError on .get().
        data = {}

    fields = {
        "text": data.get("text", ""),
        "voice": data.get("voice", "Sadachbia"),
        "accent": data.get("accent", "British"),
        "tone": data.get("tone", "casual and friendly"),
    }
    # Non-string values (null, numbers, nested objects) have no .strip();
    # reject them as a client error instead of crashing with a 500.
    for name, value in fields.items():
        if not isinstance(value, str):
            return jsonify({"error": f"Field '{name}' must be a string"}), 400

    prompt = fields["text"].strip()
    voice = fields["voice"].strip()
    accent = fields["accent"].strip()
    tone = fields["tone"].strip()
    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400

    try:
        t0 = time.perf_counter()
        wav_bytes = generate_audio_from_gemini(prompt, accent, tone, voice)
        elapsed = time.perf_counter() - t0
        app.logger.info("Gemini TTS API call took %.2fs", elapsed)
        return Response(wav_bytes, mimetype="audio/wav")
    except Exception as e:
        # Top-level request boundary: log the traceback and report the failure
        # to the client as JSON.
        app.logger.exception("Generation failed")
        return jsonify({"error": str(e)}), 500
if __name__ == "__main__":
    # Script entry point: listen on every interface, on the port named by the
    # PORT environment variable (7860 when it is unset).
    port = int(os.getenv("PORT", 7860))
    app.run(port=port, host="0.0.0.0")