Pepguy committed on
Commit
a416fca
·
verified ·
1 Parent(s): a634309

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -82
app.py CHANGED
@@ -1,25 +1,27 @@
1
- # pip install flask google-genai websockets
2
- import os, time, base64, struct, asyncio, json
3
- from flask import Flask, request, render_template_string, jsonify, Response, stream_with_context
4
  from google import genai
5
  from google.genai import types
6
- from google.genai.live import AsyncSession
7
- import threading
8
  from queue import Queue
 
9
 
10
  app = Flask(__name__)
11
 
12
  HTML = """
13
  <!DOCTYPE html>
14
  <html>
15
- <head><meta charset="UTF-8"><title>Gemini Live Streaming TTS</title></head>
16
  <body style="font-family:sans-serif;padding:2rem;">
17
- <h1>Gemini Multi (Text + Image → Live Streaming TTS)</h1>
18
  <form id="genai-form" enctype="multipart/form-data">
19
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
20
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
21
  <label>Voice: <input id="voice" name="voice" value="Puck" /></label><br/>
22
- <label>Instructions: <textarea id="instructions" name="instructions" rows="2" cols="60">Speak in a friendly, conversational tone.</textarea></label><br/><br/>
 
23
  <button type="submit">Generate</button>
24
  </form>
25
 
@@ -33,7 +35,6 @@ HTML = """
33
  // Audio streaming setup
34
  let audioContext = null;
35
  let nextStartTime = 0;
36
- let isPlaying = false;
37
  let chunksReceived = 0;
38
 
39
  function initAudioContext() {
@@ -93,7 +94,7 @@ HTML = """
93
  const formData = new FormData(form);
94
 
95
  try {
96
- const resp = await fetch('/generate_live', { method: 'POST', body: formData });
97
 
98
  if (!resp.ok) {
99
  out.textContent = 'Server error: ' + resp.statusText;
@@ -129,22 +130,22 @@ HTML = """
129
  if (data.type === 'text') {
130
  out.textContent = data.text;
131
  textReceived = true;
132
- status.textContent = 'Text received, streaming audio...';
133
  }
134
 
135
  if (data.type === 'audio_chunk' && data.audio_base64) {
136
  chunksReceived++;
137
  if (!firstAudioTime) {
138
  firstAudioTime = Date.now();
139
- status.textContent = `First audio chunk received! Streaming...`;
140
  } else {
141
- status.textContent = `Streaming... (${chunksReceived} chunks)`;
142
  }
143
  await playAudioChunk(data.audio_base64);
144
  }
145
 
146
  if (data.type === 'complete') {
147
- status.textContent = `Complete! Received ${chunksReceived} audio chunks. Text: ${data.timings.text_seconds}s, TTS: ${data.timings.tts_seconds}s, Total: ${data.timings.total_seconds}s`;
148
  }
149
  } catch (err) {
150
  console.error('Error parsing SSE:', err, line);
@@ -184,18 +185,72 @@ def extract_text(resp) -> str:
184
  parts_text.append(p.text)
185
  return "\n".join(parts_text).strip()
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  @app.route('/')
188
  def index():
189
  return render_template_string(HTML)
190
 
191
- @app.route('/generate_live', methods=['POST'])
192
- def generate_live():
193
  def generate():
194
  t_start = time.perf_counter()
195
  prompt = (request.form.get("text") or "").strip()
196
  file = request.files.get("image")
197
  voice = (request.form.get("voice") or "Puck").strip()
198
- instructions = (request.form.get("instructions") or "Speak in a friendly, conversational tone.").strip()
 
199
 
200
  if not prompt and not file:
201
  yield f"data: {json.dumps({'error': 'No input provided'})}\n\n"
@@ -206,10 +261,9 @@ def generate_live():
206
  if prompt:
207
  parts.append(types.Part.from_text(text=prompt))
208
  if file:
209
- file_data = file.read()
210
- parts.append(types.Part.from_bytes(data=file_data, mime_type=file.mimetype or "image/png"))
211
 
212
- # 1) Generate text from multimodal input
213
  t0 = time.perf_counter()
214
  try:
215
  gen_resp = client.models.generate_content(
@@ -227,79 +281,62 @@ def generate_live():
227
  yield f"data: {json.dumps({'error': 'Text generation returned empty'})}\n\n"
228
  return
229
 
230
- # Send text immediately
231
- yield f"data: {json.dumps({'type': 'text', 'text': final_text})}\n\n"
 
 
 
232
 
233
- # 2) Use Live API for true streaming TTS
234
  tts_start = time.perf_counter()
235
-
236
- # Create a queue to pass audio chunks from async context
237
  audio_queue = Queue()
238
- error_queue = Queue()
239
 
240
- async def run_live_session():
241
- try:
242
- async with client.aio.live.connect(
243
- model="gemini-2.5-pro-preview-tts",
244
- config={
245
- "generation_config": {
246
- "response_modalities": ["AUDIO"],
247
- "speech_config": {
248
- "voice_config": {
249
- "prebuilt_voice_config": {
250
- "voice_name": voice
251
- }
252
- }
253
- }
254
- },
255
- "system_instruction": instructions
256
- }
257
- ) as session:
258
- # Send the text to be spoken
259
- await session.send(final_text, end_of_turn=True)
260
-
261
- # Receive streaming audio chunks
262
- async for response in session.receive():
263
- if response.server_content:
264
- for part in response.server_content.model_turn.parts:
265
- if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
266
- audio_queue.put(part.inline_data.data)
267
-
268
- if response.server_content and response.server_content.turn_complete:
269
- break
270
-
271
- audio_queue.put(None) # Signal completion
272
-
273
- except Exception as e:
274
- error_queue.put(str(e))
275
- audio_queue.put(None)
276
 
277
- # Run async session in thread
278
- def run_async_in_thread():
279
- loop = asyncio.new_event_loop()
280
- asyncio.set_event_loop(loop)
281
- loop.run_until_complete(run_live_session())
282
- loop.close()
283
 
284
- thread = threading.Thread(target=run_async_in_thread)
285
- thread.start()
 
286
 
287
- # Stream audio chunks as they arrive
288
  while True:
289
- if not error_queue.empty():
290
- error = error_queue.get()
291
- yield f"data: {json.dumps({'error': f'Live API failed: {error}', 'text': final_text})}\n\n"
292
- break
293
-
294
- pcm_bytes = audio_queue.get()
295
- if pcm_bytes is None:
296
  break
297
 
298
- wav = wrap_pcm_to_wav(pcm_bytes)
299
- audio_b64 = base64.b64encode(wav).decode("ascii")
300
- yield f"data: {json.dumps({'type': 'audio_chunk', 'audio_base64': audio_b64})}\n\n"
 
 
 
 
 
 
 
 
 
301
 
302
- thread.join()
303
  tts_end = time.perf_counter()
304
  t_total = time.perf_counter() - t_start
305
 
 
1
+ # pip install flask google-genai
2
+ import os, time, base64, struct, re
3
+ from flask import Flask, request, render_template_string, Response, stream_with_context
4
  from google import genai
5
  from google.genai import types
6
+ import json
7
+ from concurrent.futures import ThreadPoolExecutor
8
  from queue import Queue
9
+ import threading
10
 
11
  app = Flask(__name__)
12
 
13
  HTML = """
14
  <!DOCTYPE html>
15
  <html>
16
+ <head><meta charset="UTF-8"><title>Gemini Multi (Text → Chunked Streaming TTS)</title></head>
17
  <body style="font-family:sans-serif;padding:2rem;">
18
+ <h1>Gemini Multi (Text + Image → Chunked Streaming TTS)</h1>
19
  <form id="genai-form" enctype="multipart/form-data">
20
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
21
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
22
  <label>Voice: <input id="voice" name="voice" value="Puck" /></label><br/>
23
+ <label>Accent: <input id="accent" name="accent" value="British" /></label><br/>
24
+ <label>Tone: <input id="tone" name="tone" value="casual and friendly" /></label><br/><br/>
25
  <button type="submit">Generate</button>
26
  </form>
27
 
 
35
  // Audio streaming setup
36
  let audioContext = null;
37
  let nextStartTime = 0;
 
38
  let chunksReceived = 0;
39
 
40
  function initAudioContext() {
 
94
  const formData = new FormData(form);
95
 
96
  try {
97
+ const resp = await fetch('/generate_stream', { method: 'POST', body: formData });
98
 
99
  if (!resp.ok) {
100
  out.textContent = 'Server error: ' + resp.statusText;
 
130
  if (data.type === 'text') {
131
  out.textContent = data.text;
132
  textReceived = true;
133
+ status.textContent = `Text received (${data.chunk_count} chunks), generating audio...`;
134
  }
135
 
136
  if (data.type === 'audio_chunk' && data.audio_base64) {
137
  chunksReceived++;
138
  if (!firstAudioTime) {
139
  firstAudioTime = Date.now();
140
+ status.textContent = `First audio chunk received! Playing...`;
141
  } else {
142
+ status.textContent = `Streaming audio... (${chunksReceived} chunks received)`;
143
  }
144
  await playAudioChunk(data.audio_base64);
145
  }
146
 
147
  if (data.type === 'complete') {
148
+ status.textContent = `Complete! ${chunksReceived} audio chunks. Text: ${data.timings.text_seconds}s, TTS: ${data.timings.tts_seconds}s, Total: ${data.timings.total_seconds}s`;
149
  }
150
  } catch (err) {
151
  console.error('Error parsing SSE:', err, line);
 
185
  parts_text.append(p.text)
186
  return "\n".join(parts_text).strip()
187
 
188
def chunk_text(text, max_words=25):
    """Split *text* into sentence-based chunks for parallel TTS generation.

    Sentences (split on ``.``, ``!`` or ``?`` followed by whitespace) are
    packed greedily into chunks of at most *max_words* words. A single
    sentence longer than the limit becomes its own (oversized) chunk rather
    than being split mid-sentence.

    Args:
        text: Input text to split.
        max_words: Soft word-count limit per chunk.

    Returns:
        List of chunk strings; an empty list for blank input (the original
        returned ``['']`` for blank input, which would have sent an empty
        chunk to the TTS backend).
    """
    if not text or not text.strip():
        # Robustness fix: never emit an empty chunk for blank input.
        return []

    # Split on sentence boundaries, keeping the terminating punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_words = 0

    for sentence in sentences:
        if not sentence:
            # Skip empty fragments the split can produce.
            continue
        words = len(sentence.split())
        if current_words + words > max_words and current_chunk:
            # Current chunk is full: flush it and start a new one.
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_words = words
        else:
            current_chunk.append(sentence)
            current_words += words

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
211
+
212
def generate_audio_for_chunk(chunk, voice, accent, tone, chunk_index):
    """Generate TTS audio (WAV bytes) for a single text chunk.

    Args:
        chunk: Text to speak.
        voice: Prebuilt voice name (e.g. "Puck").
        accent: Accent description injected into the style prompt.
        tone: Tone description injected into the style prompt.
        chunk_index: Position of this chunk in the original text, returned
            alongside the audio so the caller can reassemble chunks in order.

    Returns:
        ``(chunk_index, wav_bytes)`` on success, or ``(chunk_index, None)``
        when the model returns no audio or the request fails. Failures are
        printed rather than raised — deliberate best-effort so one bad
        chunk does not abort the whole parallel batch.
    """
    style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{chunk}"
    try:
        tts_resp = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=[types.Content(role="user", parts=[types.Part.from_text(text=style_prompt)])],
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
                    )
                )
            )
        )

        # Walk the response for the first inline audio part. Every attribute
        # is guarded with a default (consistency fix: the original guarded
        # `candidates` with `or []` but not `parts`, so a blocked/partial
        # response with `parts=None` fell into the generic except path).
        for cand in getattr(tts_resp, "candidates", []) or []:
            content = getattr(cand, "content", None)
            for p in getattr(content, "parts", None) or []:
                if getattr(p, "inline_data", None) and p.inline_data.data:
                    pcm_bytes = p.inline_data.data
                    # Wrap raw PCM in a WAV container so the browser's
                    # decodeAudioData can handle it directly.
                    wav = wrap_pcm_to_wav(pcm_bytes)
                    return (chunk_index, wav)

        return (chunk_index, None)
    except Exception as e:
        # Best-effort: log and signal failure via a None payload.
        print(f"Error generating audio for chunk {chunk_index}: {e}")
        return (chunk_index, None)
240
+
241
@app.route('/')
def index():
    """Serve the single-page streaming TTS UI."""
    rendered = render_template_string(HTML)
    return rendered
244
 
245
+ @app.route('/generate_stream', methods=['POST'])
246
+ def generate_stream():
247
  def generate():
248
  t_start = time.perf_counter()
249
  prompt = (request.form.get("text") or "").strip()
250
  file = request.files.get("image")
251
  voice = (request.form.get("voice") or "Puck").strip()
252
+ accent = (request.form.get("accent") or "British").strip()
253
+ tone = (request.form.get("tone") or "casual and friendly").strip()
254
 
255
  if not prompt and not file:
256
  yield f"data: {json.dumps({'error': 'No input provided'})}\n\n"
 
261
  if prompt:
262
  parts.append(types.Part.from_text(text=prompt))
263
  if file:
264
+ parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
 
265
 
266
+ # 1) Generate text
267
  t0 = time.perf_counter()
268
  try:
269
  gen_resp = client.models.generate_content(
 
281
  yield f"data: {json.dumps({'error': 'Text generation returned empty'})}\n\n"
282
  return
283
 
284
+ # Split text into chunks for parallel processing
285
+ text_chunks = chunk_text(final_text)
286
+
287
+ # Send text immediately with chunk count
288
+ yield f"data: {json.dumps({'type': 'text', 'text': final_text, 'chunk_count': len(text_chunks)})}\n\n"
289
 
290
+ # 2) Generate audio chunks in parallel with streaming
291
  tts_start = time.perf_counter()
 
 
292
  audio_queue = Queue()
 
293
 
294
+ def process_chunks():
295
+ with ThreadPoolExecutor(max_workers=4) as executor:
296
+ futures = []
297
+ for i, chunk in enumerate(text_chunks):
298
+ future = executor.submit(generate_audio_for_chunk, chunk, voice, accent, tone, i)
299
+ futures.append(future)
300
+
301
+ # Process results as they complete (not necessarily in order)
302
+ for future in futures:
303
+ try:
304
+ result = future.result(timeout=30)
305
+ audio_queue.put(result)
306
+ except Exception as e:
307
+ print(f"Error in chunk processing: {e}")
308
+ audio_queue.put((None, None))
309
+
310
+ audio_queue.put(('DONE', None))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
+ # Start parallel processing in background thread
313
+ processing_thread = threading.Thread(target=process_chunks)
314
+ processing_thread.start()
 
 
 
315
 
316
+ # Stream audio chunks as they become available (in order)
317
+ completed_chunks = {}
318
+ next_chunk_to_send = 0
319
 
 
320
  while True:
321
+ chunk_index, wav_data = audio_queue.get()
322
+
323
+ if chunk_index == 'DONE':
 
 
 
 
324
  break
325
 
326
+ if wav_data is None:
327
+ continue
328
+
329
+ # Store completed chunk
330
+ completed_chunks[chunk_index] = wav_data
331
+
332
+ # Send chunks in order
333
+ while next_chunk_to_send in completed_chunks:
334
+ audio_b64 = base64.b64encode(completed_chunks[next_chunk_to_send]).decode("ascii")
335
+ yield f"data: {json.dumps({'type': 'audio_chunk', 'audio_base64': audio_b64, 'chunk_index': next_chunk_to_send})}\n\n"
336
+ del completed_chunks[next_chunk_to_send]
337
+ next_chunk_to_send += 1
338
 
339
+ processing_thread.join()
340
  tts_end = time.perf_counter()
341
  t_total = time.perf_counter() - t_start
342