File size: 4,807 Bytes
0998987 d868c8e 0998987 766b0e7 0998987 766b0e7 07221e5 0998987 e285d1f 0998987 f7336ac 0998987 766b0e7 0998987 f7336ac 85e052a f7336ac 0998987 f7336ac 0998987 c6d4440 8b8432d 85e052a f7336ac c6d4440 8b8432d c6d4440 766b0e7 c6d4440 f7336ac c6d4440 0998987 85e052a 766b0e7 85e052a 58eca8a 85e052a 766b0e7 f7336ac 58eca8a 5c6206c 766b0e7 85e052a 766b0e7 5c6206c 1351e15 f7336ac 766b0e7 f7336ac 766b0e7 f7336ac 0998987 c6d4440 8b8432d 85e052a 0998987 e285d1f d868c8e 85e052a d868c8e 766b0e7 0998987 f7336ac 0998987 f7336ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# pip install flask google-genai
import time
import os
from flask import Flask, request, render_template_string, Response, jsonify
from google import genai
from google.genai import types
import struct
# Flask application object; the route handlers below register on it via @app.route.
app = Flask(__name__)
# Single-page test UI passed to render_template_string() by index().
# Its inline script POSTs JSON {text, voice, accent, tone} to /generate and,
# on success, plays the returned blob in an <audio> element; on failure it
# shows the HTTP status and response body.
HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Gemini TTS Test</title>
</head>
<body style="font-family:sans-serif;padding:2rem;">
<h1>Gemini-2.5-Flash-Preview-TTS</h1>
<form id="genai-form">
<textarea id="prompt" rows="6" cols="60" placeholder="Enter text to synthesize"></textarea><br/><br/>
<label>Voice: <input id="voice" value="Sadachbia" /></label><br/>
<label>Accent: <input id="accent" value="British" /></label><br/>
<label>Tone: <input id="tone" value="casual and friendly" /></label><br/><br/>
<button type="submit">Generate</button>
</form>
<div id="output" style="margin-top:1rem;"></div>
<script>
const form = document.getElementById('genai-form');
form.addEventListener('submit', async e => {
e.preventDefault();
const text = document.getElementById('prompt').value.trim();
const voice = document.getElementById('voice').value.trim();
const accent = document.getElementById('accent').value.trim();
const tone = document.getElementById('tone').value.trim();
const out = document.getElementById('output');
out.textContent = 'Generating…';
try {
const resp = await fetch('/generate', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text, voice, accent, tone }),
});
if (!resp.ok) {
const errText = await resp.text();
throw new Error(`Server returned ${resp.status}: ${errText}`);
}
const blob = await resp.blob();
const url = URL.createObjectURL(blob);
out.innerHTML = '<audio controls src="' + url + '"></audio>';
} catch (err) {
console.error(err);
out.textContent = 'Fetch error: ' + err.message;
}
});
</script>
</body>
</html>
"""
# Shared Gemini client used by generate_audio_from_gemini().
# SECURITY: the API key is taken from the GOOGLE_API_KEY environment variable
# only. A literal key used to be embedded here as a fallback; credentials must
# never live in source control, and that previously committed key should be
# treated as compromised and revoked.
# NOTE(review): when the variable is unset this passes api_key=None; the SDK is
# expected to reject that (or resolve the key itself) - confirm against the
# google-genai Client documentation.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
def wrap_pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
    """Wrap raw PCM bytes in a minimal RIFF/WAVE container.

    The result is a 44-byte canonical header (``RIFF``/``WAVE``, a 16-byte
    ``fmt `` chunk with format tag 1 = uncompressed PCM, and a ``data`` chunk
    header) followed by the samples.

    Args:
        pcm_data: Raw little-endian PCM sample bytes; may be empty.
        sample_rate: Samples per second per channel.
        num_channels: Number of interleaved channels.
        bits_per_sample: Bit depth of one sample.

    Returns:
        The complete WAV file contents.

    Raises:
        struct.error: If a value does not fit its header field (for example a
            payload of 4 GiB or more).
    """
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8
    data_size = len(pcm_data)
    # RIFF chunks are word-aligned: an odd-sized payload is followed by one
    # zero pad byte, which counts toward the RIFF size but not toward the
    # data chunk's own size field.
    padding = b"\x00" * (data_size % 2)
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size + len(padding), b"WAVE",
        b"fmt ", 16, 1, num_channels,
        sample_rate, byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )
    return b"".join((header, pcm_data, padding))
def _pcm_sample_rate(mime_type, default: int = 24000) -> int:
    """Return the positive ``rate=`` parameter of an audio MIME type, else *default*."""
    for param in (mime_type or "").split(";")[1:]:
        key, _, value = param.partition("=")
        value = value.strip()
        if key.strip().lower() == "rate" and value.isdecimal() and int(value) > 0:
            return int(value)
    return default


def generate_audio_from_gemini(prompt: str, accent: str, tone: str, voice: str) -> bytes:
    """Synthesize *prompt* with the Gemini TTS model and return WAV bytes.

    Args:
        prompt: Text to be spoken.
        accent: Accent description inserted into the style instruction.
        tone: Tone description inserted into the style instruction.
        voice: Name of the prebuilt voice passed to the API.

    Returns:
        The audio wrapped in a WAV container by ``wrap_pcm_to_wav``.

    Raises:
        RuntimeError: If the response has no candidate, content, or part
            carrying inline audio data.
        Exception: Whatever the Gemini client raises for a failed request.
    """
    style_prompt = f"Say the following text in a {accent} accent with a {tone} tone:\n\n{prompt}"
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=[types.Content(role="user", parts=[types.Part(text=style_prompt)])],
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice
                    )
                )
            ),
        ),
    )
    # Any level of the response may be missing or empty (e.g. a blocked
    # request), so walk it defensively rather than indexing [0] blindly.
    candidates = response.candidates or []
    content = candidates[0].content if candidates else None
    parts = (content.parts if content is not None else None) or []
    audio = next(
        (
            part.inline_data
            for part in parts
            if part.inline_data is not None and part.inline_data.data
        ),
        None,
    )
    if audio is None:
        raise RuntimeError("No audio returned from Gemini")
    # NOTE(review): the MIME type is expected to look like
    # "audio/L16;codec=pcm;rate=24000"; 24000 Hz is used when no rate is present.
    sample_rate = _pcm_sample_rate(getattr(audio, "mime_type", None))
    return wrap_pcm_to_wav(audio.data, sample_rate=sample_rate)
@app.route('/')
def index():
    """Serve the test page by rendering the ``HTML`` template string."""
    page = render_template_string(HTML)
    return page
@app.route('/generate', methods=['POST'])
def gen():
    """Handle POST /generate: synthesize speech for the submitted text.

    Expects a JSON object with a ``text`` key and optional ``voice``,
    ``accent`` and ``tone`` keys. A missing, blank, or non-string optional
    value falls back to its default.

    Returns:
        200 with an ``audio/wav`` body on success;
        400 with ``{"error": ...}`` when no usable ``text`` was supplied;
        500 with ``{"error": ...}`` when generation fails.
    """
    payload = request.get_json(silent=True)
    # A syntactically valid body may still be a list, string or number.
    if not isinstance(payload, dict):
        payload = {}

    def text_field(key, default=""):
        # Only strings are accepted; anything else is treated as absent so a
        # malformed request cannot crash the handler on .strip().
        value = payload.get(key)
        cleaned = value.strip() if isinstance(value, str) else ""
        return cleaned or default

    prompt = text_field("text")
    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400
    voice = text_field("voice", "Sadachbia")
    accent = text_field("accent", "British")
    tone = text_field("tone", "casual and friendly")

    try:
        started = time.perf_counter()
        wav_bytes = generate_audio_from_gemini(prompt, accent, tone, voice)
        app.logger.info("Gemini TTS API call took %.2fs", time.perf_counter() - started)
    except Exception as e:
        # Top-level request boundary: log the traceback and report the failure.
        # NOTE(review): the exception text is returned to the client because
        # the page displays it; consider a generic message for public deployments.
        app.logger.exception("Generation failed")
        return jsonify({"error": str(e)}), 500
    return Response(wav_bytes, mimetype="audio/wav")
if __name__ == "__main__":
    # Script entry point: listen on all interfaces, on the port named by the
    # PORT environment variable (7860 when unset).
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)