Pepguy commited on
Commit
0ef208e
·
verified ·
1 Parent(s): 83daf1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -33
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # pip install flask google-genai
2
  import os, time, base64, struct
3
- from flask import Flask, request, render_template_string, jsonify
4
  from google import genai
5
  from google.genai import types
6
 
@@ -9,9 +9,9 @@ app = Flask(__name__)
9
  HTML = """
10
  <!DOCTYPE html>
11
  <html>
12
- <head><meta charset="UTF-8"><title>Gemini Multi (Text → Styled TTS)</title></head>
13
  <body style="font-family:sans-serif;padding:2rem;">
14
- <h1>Gemini Multi (Text + Image → Styled TTS)</h1>
15
  <form id="genai-form" enctype="multipart/form-data">
16
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
17
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
@@ -23,53 +23,130 @@ HTML = """
23
 
24
  <pre id="output" style="background:#f4f4f4;padding:1rem;margin-top:1rem;"></pre>
25
  <div id="audio-out" style="margin-top:1rem;"></div>
 
26
 
27
  <script>
28
  const form = document.getElementById('genai-form');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  form.addEventListener('submit', async e => {
30
  e.preventDefault();
31
  const out = document.getElementById('output');
32
  const audioDiv = document.getElementById('audio-out');
33
- out.textContent = 'Generating…';
 
 
34
  audioDiv.innerHTML = '';
 
 
 
 
 
 
35
 
36
  const formData = new FormData(form);
 
37
  try {
38
- const resp = await fetch('/generate', { method: 'POST', body: formData });
39
- const data = await resp.json();
40
-
41
  if (!resp.ok) {
42
- out.textContent = 'Server error: ' + (data && data.error ? data.error : resp.statusText);
43
  return;
44
  }
45
 
46
- if (data.error) {
47
- out.textContent = 'Error: ' + data.error;
48
- return;
49
- }
 
50
 
51
- if (data.timings) {
52
- out.textContent =
53
- 'text_seconds: ' + data.timings.text_seconds +
54
- ', tts_seconds: ' + data.timings.tts_seconds +
55
- ', total_seconds: ' + data.timings.total_seconds +
56
- '\\n\\n' + (data.text || "(no text)");
57
- } else {
58
- out.textContent = data.text || "(no text)";
59
- }
60
 
61
- if (data.audio_base64) {
62
- const audio = document.createElement('audio');
63
- audio.controls = true;
64
- audio.src = "data:audio/wav;base64," + data.audio_base64;
65
- audioDiv.appendChild(audio);
66
- try { audio.play().catch(()=>{}); } catch(e){}
67
- } else {
68
- audioDiv.textContent = 'No audio returned';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
  } catch (err) {
71
  console.error(err);
72
  out.textContent = 'Fetch error: ' + err.message;
 
73
  }
74
  });
75
  </script>
@@ -77,7 +154,7 @@ HTML = """
77
  </html>
78
  """
79
 
80
- client = genai.Client(api_key="AIzaSyDolbPUZBPUPvQUu-RGktJmvnUpkcEKIYo",)
81
 
82
  def wrap_pcm_to_wav(pcm_data: bytes, sample_rate=24000, num_channels=1, bits_per_sample=16) -> bytes:
83
  byte_rate = sample_rate * num_channels * bits_per_sample // 8
@@ -103,6 +180,89 @@ def extract_text(resp) -> str:
103
  def index():
104
  return render_template_string(HTML)
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  @app.route('/generate', methods=['POST'])
107
  def generate():
108
  t_start = time.perf_counter()
@@ -115,14 +275,12 @@ def generate():
115
  if not prompt and not file:
116
  return jsonify({"error": "No input provided"}), 400
117
 
118
- # parts for multimodal input
119
  parts = []
120
  if prompt:
121
  parts.append(types.Part.from_text(text=prompt))
122
  if file:
123
  parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
124
 
125
- # 1) Generate text from multimodal input
126
  t0 = time.perf_counter()
127
  try:
128
  gen_resp = client.models.generate_content(
@@ -138,7 +296,6 @@ def generate():
138
  if not final_text:
139
  return jsonify({"error": "Text generation returned empty"}), 500
140
 
141
- # 2) Voice-prompted TTS
142
  style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{final_text}"
143
  tts_start = time.perf_counter()
144
  try:
 
1
  # pip install flask google-genai
2
  import os, time, base64, struct
3
+ from flask import Flask, request, render_template_string, jsonify, Response, stream_with_context
4
  from google import genai
5
  from google.genai import types
6
 
 
9
  HTML = """
10
  <!DOCTYPE html>
11
  <html>
12
+ <head><meta charset="UTF-8"><title>Gemini Multi (Text → Streaming TTS)</title></head>
13
  <body style="font-family:sans-serif;padding:2rem;">
14
+ <h1>Gemini Multi (Text + Image → Streaming TTS)</h1>
15
  <form id="genai-form" enctype="multipart/form-data">
16
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
17
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
 
23
 
24
  <pre id="output" style="background:#f4f4f4;padding:1rem;margin-top:1rem;"></pre>
25
  <div id="audio-out" style="margin-top:1rem;"></div>
26
+ <div id="status" style="margin-top:1rem;color:#666;"></div>
27
 
28
  <script>
29
  const form = document.getElementById('genai-form');
30
+
31
+ // Audio streaming setup
32
+ let audioContext = null;
33
+ let nextStartTime = 0;
34
+ let audioQueue = [];
35
+ let isPlaying = false;
36
+
37
+ function initAudioContext() {
38
+ if (!audioContext) {
39
+ audioContext = new (window.AudioContext || window.webkitAudioContext)();
40
+ }
41
+ return audioContext;
42
+ }
43
+
44
+ function base64ToArrayBuffer(base64) {
45
+ const binaryString = atob(base64);
46
+ const bytes = new Uint8Array(binaryString.length);
47
+ for (let i = 0; i < binaryString.length; i++) {
48
+ bytes[i] = binaryString.charCodeAt(i);
49
+ }
50
+ return bytes.buffer;
51
+ }
52
+
53
+ async function playAudioChunk(wavBase64) {
54
+ const ctx = initAudioContext();
55
+ const arrayBuffer = base64ToArrayBuffer(wavBase64);
56
+
57
+ try {
58
+ const audioBuffer = await ctx.decodeAudioData(arrayBuffer);
59
+ const source = ctx.createBufferSource();
60
+ source.buffer = audioBuffer;
61
+ source.connect(ctx.destination);
62
+
63
+ const currentTime = ctx.currentTime;
64
+ const startTime = Math.max(currentTime, nextStartTime);
65
+ source.start(startTime);
66
+ nextStartTime = startTime + audioBuffer.duration;
67
+
68
+ return audioBuffer.duration;
69
+ } catch (err) {
70
+ console.error('Error playing audio chunk:', err);
71
+ return 0;
72
+ }
73
+ }
74
+
75
  form.addEventListener('submit', async e => {
76
  e.preventDefault();
77
  const out = document.getElementById('output');
78
  const audioDiv = document.getElementById('audio-out');
79
+ const status = document.getElementById('status');
80
+
81
+ out.textContent = 'Generating text…';
82
  audioDiv.innerHTML = '';
83
+ status.textContent = '';
84
+
85
+ // Reset audio state
86
+ if (audioContext) {
87
+ nextStartTime = audioContext.currentTime;
88
+ }
89
 
90
  const formData = new FormData(form);
91
+
92
  try {
93
+ const resp = await fetch('/generate_stream', { method: 'POST', body: formData });
94
+
 
95
  if (!resp.ok) {
96
+ out.textContent = 'Server error: ' + resp.statusText;
97
  return;
98
  }
99
 
100
+ const reader = resp.body.getReader();
101
+ const decoder = new TextDecoder();
102
+ let buffer = '';
103
+ let textReceived = false;
104
+ let audioChunks = 0;
105
 
106
+ while (true) {
107
+ const { done, value } = await reader.read();
108
+ if (done) break;
 
 
 
 
 
 
109
 
110
+ buffer += decoder.decode(value, { stream: true });
111
+ const lines = buffer.split('\\n');
112
+ buffer = lines.pop(); // Keep incomplete line in buffer
113
+
114
+ for (const line of lines) {
115
+ if (!line.trim() || !line.startsWith('data: ')) continue;
116
+
117
+ try {
118
+ const data = JSON.parse(line.slice(6));
119
+
120
+ if (data.error) {
121
+ out.textContent = 'Error: ' + data.error;
122
+ status.textContent = '';
123
+ return;
124
+ }
125
+
126
+ if (data.type === 'text') {
127
+ out.textContent = data.text;
128
+ textReceived = true;
129
+ status.textContent = 'Text received, generating audio...';
130
+ }
131
+
132
+ if (data.type === 'audio_chunk' && data.audio_base64) {
133
+ audioChunks++;
134
+ status.textContent = `Streaming audio... (chunk ${audioChunks})`;
135
+ await playAudioChunk(data.audio_base64);
136
+ }
137
+
138
+ if (data.type === 'complete') {
139
+ status.textContent = `Complete! Text: ${data.timings.text_seconds}s, TTS: ${data.timings.tts_seconds}s, Total: ${data.timings.total_seconds}s`;
140
+ }
141
+ } catch (err) {
142
+ console.error('Error parsing SSE:', err, line);
143
+ }
144
+ }
145
  }
146
  } catch (err) {
147
  console.error(err);
148
  out.textContent = 'Fetch error: ' + err.message;
149
+ status.textContent = '';
150
  }
151
  });
152
  </script>
 
154
  </html>
155
  """
156
 
# SECURITY: never hard-code API keys in source. The key previously embedded
# on this line is exposed in the repository history and must be rotated.
# Load the credential from the environment instead (set GEMINI_API_KEY
# before starting the app).
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))
 
159
  def wrap_pcm_to_wav(pcm_data: bytes, sample_rate=24000, num_channels=1, bits_per_sample=16) -> bytes:
160
  byte_rate = sample_rate * num_channels * bits_per_sample // 8
 
180
  def index():
181
  return render_template_string(HTML)
182
 
183
@app.route('/generate_stream', methods=['POST'])
def generate_stream():
    """Stream generated text and styled TTS audio as Server-Sent Events.

    Form fields: ``text`` (prompt), ``image`` (optional upload), ``voice``,
    ``accent``, ``tone``.

    SSE events emitted, each as one ``data: <json>\\n\\n`` line:
      * ``{"type": "text", "text": ...}``         — generated text, sent first
      * ``{"type": "audio_chunk", "audio_base64": ...}`` — WAV chunks as TTS arrives
      * ``{"type": "complete", "timings": ...}``  — timing summary at the end
      * ``{"error": ...}``                        — on any failure (stream then ends)
    """
    import json  # local import: only needed to serialize SSE payloads

    def sse(payload: dict) -> str:
        # One SSE event. json.dumps is the right tool here: the previous
        # revision called jsonify(...).get_data(as_text=True) per event,
        # constructing a full Flask Response object for every streamed chunk.
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        t_start = time.perf_counter()
        prompt = (request.form.get("text") or "").strip()
        file = request.files.get("image")
        voice = (request.form.get("voice") or "Sadachbia").strip()
        accent = (request.form.get("accent") or "British").strip()
        tone = (request.form.get("tone") or "casual and friendly").strip()

        if not prompt and not file:
            yield sse({"error": "No input provided"})
            return

        # Build the multimodal request: text part and/or image part.
        parts = []
        if prompt:
            parts.append(types.Part.from_text(text=prompt))
        if file:
            parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))

        # 1) Generate text from the multimodal input.
        t0 = time.perf_counter()
        try:
            gen_resp = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=[types.Content(role="user", parts=parts)],
                config=types.GenerateContentConfig(response_mime_type="text/plain"),
            )
        except Exception as e:
            yield sse({"error": f"text generation failed: {str(e)}"})
            return
        t1 = time.perf_counter()

        final_text = extract_text(gen_resp)
        if not final_text:
            yield sse({"error": "Text generation returned empty"})
            return

        # Send the text immediately so the UI can render it before audio starts.
        yield sse({"type": "text", "text": final_text})

        # 2) Stream styled TTS audio for the generated text.
        style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{final_text}"
        tts_start = time.perf_counter()

        try:
            tts_stream = client.models.generate_content_stream(
                model="gemini-2.5-flash-preview-tts",
                contents=[types.Content(role="user", parts=[types.Part.from_text(text=style_prompt)])],
                config=types.GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=types.SpeechConfig(
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
                        )
                    ),
                ),
            )

            for chunk in tts_stream:
                # candidates / content.parts may be absent (or present but
                # None) on some streamed chunks — guard with `or []` so a
                # None value cannot crash the iteration.
                for cand in getattr(chunk, "candidates", None) or []:
                    for p in getattr(cand.content, "parts", None) or []:
                        if getattr(p, "inline_data", None) and p.inline_data.data:
                            # Wrap each raw PCM chunk as a standalone WAV so
                            # the browser can decode it independently.
                            wav = wrap_pcm_to_wav(p.inline_data.data)
                            audio_b64 = base64.b64encode(wav).decode("ascii")
                            yield sse({"type": "audio_chunk", "audio_base64": audio_b64})

        except Exception as e:
            # Include the text so the client still has something to show.
            yield sse({"error": f"tts streaming failed: {str(e)}", "text": final_text})
            return

        tts_end = time.perf_counter()
        t_total = time.perf_counter() - t_start

        # Final event: per-stage and total timings, rounded for display.
        yield sse({
            "type": "complete",
            "timings": {
                "text_seconds": round(t1 - t0, 3),
                "tts_seconds": round(tts_end - tts_start, 3),
                "total_seconds": round(t_total, 3),
            },
        })

    # stream_with_context keeps the request context alive while the
    # generator is being consumed.
    return Response(stream_with_context(generate()), mimetype='text/event-stream')
264
+
265
+ # Keep the original endpoint for compatibility
266
  @app.route('/generate', methods=['POST'])
267
  def generate():
268
  t_start = time.perf_counter()
 
275
  if not prompt and not file:
276
  return jsonify({"error": "No input provided"}), 400
277
 
 
278
  parts = []
279
  if prompt:
280
  parts.append(types.Part.from_text(text=prompt))
281
  if file:
282
  parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
283
 
 
284
  t0 = time.perf_counter()
285
  try:
286
  gen_resp = client.models.generate_content(
 
296
  if not final_text:
297
  return jsonify({"error": "Text generation returned empty"}), 500
298
 
 
299
  style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{final_text}"
300
  tts_start = time.perf_counter()
301
  try: