Pepguy committed on
Commit
a416fca
·
verified ·
1 Parent(s): a634309

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -82
app.py CHANGED
@@ -1,25 +1,27 @@
1
- # pip install flask google-genai websockets
2
- import os, time, base64, struct, asyncio, json
3
- from flask import Flask, request, render_template_string, jsonify, Response, stream_with_context
4
  from google import genai
5
  from google.genai import types
6
- from google.genai.live import AsyncSession
7
- import threading
8
  from queue import Queue
 
9
 
10
  app = Flask(__name__)
11
 
12
  HTML = """
13
  <!DOCTYPE html>
14
  <html>
15
- <head><meta charset="UTF-8"><title>Gemini Live Streaming TTS</title></head>
16
  <body style="font-family:sans-serif;padding:2rem;">
17
- <h1>Gemini Multi (Text + Image → Live Streaming TTS)</h1>
18
  <form id="genai-form" enctype="multipart/form-data">
19
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
20
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
21
  <label>Voice: <input id="voice" name="voice" value="Puck" /></label><br/>
22
- <label>Instructions: <textarea id="instructions" name="instructions" rows="2" cols="60">Speak in a friendly, conversational tone.</textarea></label><br/><br/>
 
23
  <button type="submit">Generate</button>
24
  </form>
25
 
@@ -33,7 +35,6 @@ HTML = """
33
  // Audio streaming setup
34
  let audioContext = null;
35
  let nextStartTime = 0;
36
- let isPlaying = false;
37
  let chunksReceived = 0;
38
 
39
  function initAudioContext() {
@@ -93,7 +94,7 @@ HTML = """
93
  const formData = new FormData(form);
94
 
95
  try {
96
- const resp = await fetch('/generate_live', { method: 'POST', body: formData });
97
 
98
  if (!resp.ok) {
99
  out.textContent = 'Server error: ' + resp.statusText;
@@ -129,22 +130,22 @@ HTML = """
129
  if (data.type === 'text') {
130
  out.textContent = data.text;
131
  textReceived = true;
132
- status.textContent = 'Text received, streaming audio...';
133
  }
134
 
135
  if (data.type === 'audio_chunk' && data.audio_base64) {
136
  chunksReceived++;
137
  if (!firstAudioTime) {
138
  firstAudioTime = Date.now();
139
- status.textContent = `First audio chunk received! Streaming...`;
140
  } else {
141
- status.textContent = `Streaming... (${chunksReceived} chunks)`;
142
  }
143
  await playAudioChunk(data.audio_base64);
144
  }
145
 
146
  if (data.type === 'complete') {
147
- status.textContent = `Complete! Received ${chunksReceived} audio chunks. Text: ${data.timings.text_seconds}s, TTS: ${data.timings.tts_seconds}s, Total: ${data.timings.total_seconds}s`;
148
  }
149
  } catch (err) {
150
  console.error('Error parsing SSE:', err, line);
@@ -184,18 +185,72 @@ def extract_text(resp) -> str:
184
  parts_text.append(p.text)
185
  return "\n".join(parts_text).strip()
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  @app.route('/')
188
  def index():
189
  return render_template_string(HTML)
190
 
191
- @app.route('/generate_live', methods=['POST'])
192
- def generate_live():
193
  def generate():
194
  t_start = time.perf_counter()
195
  prompt = (request.form.get("text") or "").strip()
196
  file = request.files.get("image")
197
  voice = (request.form.get("voice") or "Puck").strip()
198
- instructions = (request.form.get("instructions") or "Speak in a friendly, conversational tone.").strip()
 
199
 
200
  if not prompt and not file:
201
  yield f"data: {json.dumps({'error': 'No input provided'})}\n\n"
@@ -206,10 +261,9 @@ def generate_live():
206
  if prompt:
207
  parts.append(types.Part.from_text(text=prompt))
208
  if file:
209
- file_data = file.read()
210
- parts.append(types.Part.from_bytes(data=file_data, mime_type=file.mimetype or "image/png"))
211
 
212
- # 1) Generate text from multimodal input
213
  t0 = time.perf_counter()
214
  try:
215
  gen_resp = client.models.generate_content(
@@ -227,79 +281,62 @@ def generate_live():
227
  yield f"data: {json.dumps({'error': 'Text generation returned empty'})}\n\n"
228
  return
229
 
230
- # Send text immediately
231
- yield f"data: {json.dumps({'type': 'text', 'text': final_text})}\n\n"
 
 
 
232
 
233
- # 2) Use Live API for true streaming TTS
234
  tts_start = time.perf_counter()
235
-
236
- # Create a queue to pass audio chunks from async context
237
  audio_queue = Queue()
238
- error_queue = Queue()
239
 
240
- async def run_live_session():
241
- try:
242
- async with client.aio.live.connect(
243
- model="gemini-2.5-pro-preview-tts",
244
- config={
245
- "generation_config": {
246
- "response_modalities": ["AUDIO"],
247
- "speech_config": {
248
- "voice_config": {
249
- "prebuilt_voice_config": {
250
- "voice_name": voice
251
- }
252
- }
253
- }
254
- },
255
- "system_instruction": instructions
256
- }
257
- ) as session:
258
- # Send the text to be spoken
259
- await session.send(final_text, end_of_turn=True)
260
-
261
- # Receive streaming audio chunks
262
- async for response in session.receive():
263
- if response.server_content:
264
- for part in response.server_content.model_turn.parts:
265
- if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
266
- audio_queue.put(part.inline_data.data)
267
-
268
- if response.server_content and response.server_content.turn_complete:
269
- break
270
-
271
- audio_queue.put(None) # Signal completion
272
-
273
- except Exception as e:
274
- error_queue.put(str(e))
275
- audio_queue.put(None)
276
 
277
- # Run async session in thread
278
- def run_async_in_thread():
279
- loop = asyncio.new_event_loop()
280
- asyncio.set_event_loop(loop)
281
- loop.run_until_complete(run_live_session())
282
- loop.close()
283
 
284
- thread = threading.Thread(target=run_async_in_thread)
285
- thread.start()
 
286
 
287
- # Stream audio chunks as they arrive
288
  while True:
289
- if not error_queue.empty():
290
- error = error_queue.get()
291
- yield f"data: {json.dumps({'error': f'Live API failed: {error}', 'text': final_text})}\n\n"
292
- break
293
-
294
- pcm_bytes = audio_queue.get()
295
- if pcm_bytes is None:
296
  break
297
 
298
- wav = wrap_pcm_to_wav(pcm_bytes)
299
- audio_b64 = base64.b64encode(wav).decode("ascii")
300
- yield f"data: {json.dumps({'type': 'audio_chunk', 'audio_base64': audio_b64})}\n\n"
 
 
 
 
 
 
 
 
 
301
 
302
- thread.join()
303
  tts_end = time.perf_counter()
304
  t_total = time.perf_counter() - t_start
305
 
 
1
+ # pip install flask google-genai
2
+ import os, time, base64, struct, re
3
+ from flask import Flask, request, render_template_string, Response, stream_with_context
4
  from google import genai
5
  from google.genai import types
6
+ import json
7
+ from concurrent.futures import ThreadPoolExecutor
8
  from queue import Queue
9
+ import threading
10
 
11
  app = Flask(__name__)
12
 
13
  HTML = """
14
  <!DOCTYPE html>
15
  <html>
16
+ <head><meta charset="UTF-8"><title>Gemini Multi (Text → Chunked Streaming TTS)</title></head>
17
  <body style="font-family:sans-serif;padding:2rem;">
18
+ <h1>Gemini Multi (Text + Image → Chunked Streaming TTS)</h1>
19
  <form id="genai-form" enctype="multipart/form-data">
20
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
21
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
22
  <label>Voice: <input id="voice" name="voice" value="Puck" /></label><br/>
23
+ <label>Accent: <input id="accent" name="accent" value="British" /></label><br/>
24
+ <label>Tone: <input id="tone" name="tone" value="casual and friendly" /></label><br/><br/>
25
  <button type="submit">Generate</button>
26
  </form>
27
 
 
35
  // Audio streaming setup
36
  let audioContext = null;
37
  let nextStartTime = 0;
 
38
  let chunksReceived = 0;
39
 
40
  function initAudioContext() {
 
94
  const formData = new FormData(form);
95
 
96
  try {
97
+ const resp = await fetch('/generate_stream', { method: 'POST', body: formData });
98
 
99
  if (!resp.ok) {
100
  out.textContent = 'Server error: ' + resp.statusText;
 
130
  if (data.type === 'text') {
131
  out.textContent = data.text;
132
  textReceived = true;
133
+ status.textContent = `Text received (${data.chunk_count} chunks), generating audio...`;
134
  }
135
 
136
  if (data.type === 'audio_chunk' && data.audio_base64) {
137
  chunksReceived++;
138
  if (!firstAudioTime) {
139
  firstAudioTime = Date.now();
140
+ status.textContent = `First audio chunk received! Playing...`;
141
  } else {
142
+ status.textContent = `Streaming audio... (${chunksReceived} chunks received)`;
143
  }
144
  await playAudioChunk(data.audio_base64);
145
  }
146
 
147
  if (data.type === 'complete') {
148
+ status.textContent = `Complete! ${chunksReceived} audio chunks. Text: ${data.timings.text_seconds}s, TTS: ${data.timings.tts_seconds}s, Total: ${data.timings.total_seconds}s`;
149
  }
150
  } catch (err) {
151
  console.error('Error parsing SSE:', err, line);
 
185
  parts_text.append(p.text)
186
  return "\n".join(parts_text).strip()
187
 
188
def chunk_text(text, max_words=25):
    """Split *text* into sentence-based chunks for parallel TTS generation.

    Sentences (split on ``.``, ``!`` or ``?`` followed by whitespace) are
    packed greedily into chunks of at most *max_words* words. A single
    sentence longer than the limit becomes its own (oversized) chunk rather
    than being split mid-sentence.

    Args:
        text: Input text to split.
        max_words: Soft word-count limit per chunk.

    Returns:
        List of chunk strings; an empty list for blank input (the original
        returned ``['']`` for blank input, which would have sent an empty
        chunk to the TTS backend).
    """
    if not text or not text.strip():
        # Robustness fix: never emit an empty chunk for blank input.
        return []

    # Split on sentence boundaries, keeping the terminating punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_words = 0

    for sentence in sentences:
        if not sentence:
            # Skip empty fragments the split can produce.
            continue
        words = len(sentence.split())
        if current_words + words > max_words and current_chunk:
            # Current chunk is full: flush it and start a new one.
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_words = words
        else:
            current_chunk.append(sentence)
            current_words += words

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
211
+
212
def generate_audio_for_chunk(chunk, voice, accent, tone, chunk_index):
    """Generate TTS audio (WAV bytes) for a single text chunk.

    Args:
        chunk: Text to speak.
        voice: Prebuilt voice name (e.g. "Puck").
        accent: Accent description injected into the style prompt.
        tone: Tone description injected into the style prompt.
        chunk_index: Position of this chunk in the original text, returned
            alongside the audio so the caller can reassemble chunks in order.

    Returns:
        ``(chunk_index, wav_bytes)`` on success, or ``(chunk_index, None)``
        when the model returns no audio or the request fails. Failures are
        printed rather than raised — deliberate best-effort so one bad
        chunk does not abort the whole parallel batch.
    """
    style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{chunk}"
    try:
        tts_resp = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=[types.Content(role="user", parts=[types.Part.from_text(text=style_prompt)])],
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
                    )
                )
            )
        )

        # Walk the response for the first inline audio part. Every attribute
        # is guarded with a default (consistency fix: the original guarded
        # `candidates` with `or []` but not `parts`, so a blocked/partial
        # response with `parts=None` fell into the generic except path).
        for cand in getattr(tts_resp, "candidates", []) or []:
            content = getattr(cand, "content", None)
            for p in getattr(content, "parts", None) or []:
                if getattr(p, "inline_data", None) and p.inline_data.data:
                    pcm_bytes = p.inline_data.data
                    # Wrap raw PCM in a WAV container so the browser's
                    # decodeAudioData can handle it directly.
                    wav = wrap_pcm_to_wav(pcm_bytes)
                    return (chunk_index, wav)

        return (chunk_index, None)
    except Exception as e:
        # Best-effort: log and signal failure via a None payload.
        print(f"Error generating audio for chunk {chunk_index}: {e}")
        return (chunk_index, None)
240
+
241
@app.route('/')
def index():
    """Serve the single-page streaming TTS UI."""
    rendered = render_template_string(HTML)
    return rendered
244
 
245
+ @app.route('/generate_stream', methods=['POST'])
246
+ def generate_stream():
247
  def generate():
248
  t_start = time.perf_counter()
249
  prompt = (request.form.get("text") or "").strip()
250
  file = request.files.get("image")
251
  voice = (request.form.get("voice") or "Puck").strip()
252
+ accent = (request.form.get("accent") or "British").strip()
253
+ tone = (request.form.get("tone") or "casual and friendly").strip()
254
 
255
  if not prompt and not file:
256
  yield f"data: {json.dumps({'error': 'No input provided'})}\n\n"
 
261
  if prompt:
262
  parts.append(types.Part.from_text(text=prompt))
263
  if file:
264
+ parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
 
265
 
266
+ # 1) Generate text
267
  t0 = time.perf_counter()
268
  try:
269
  gen_resp = client.models.generate_content(
 
281
  yield f"data: {json.dumps({'error': 'Text generation returned empty'})}\n\n"
282
  return
283
 
284
+ # Split text into chunks for parallel processing
285
+ text_chunks = chunk_text(final_text)
286
+
287
+ # Send text immediately with chunk count
288
+ yield f"data: {json.dumps({'type': 'text', 'text': final_text, 'chunk_count': len(text_chunks)})}\n\n"
289
 
290
+ # 2) Generate audio chunks in parallel with streaming
291
  tts_start = time.perf_counter()
 
 
292
  audio_queue = Queue()
 
293
 
294
+ def process_chunks():
295
+ with ThreadPoolExecutor(max_workers=4) as executor:
296
+ futures = []
297
+ for i, chunk in enumerate(text_chunks):
298
+ future = executor.submit(generate_audio_for_chunk, chunk, voice, accent, tone, i)
299
+ futures.append(future)
300
+
301
+ # Process results as they complete (not necessarily in order)
302
+ for future in futures:
303
+ try:
304
+ result = future.result(timeout=30)
305
+ audio_queue.put(result)
306
+ except Exception as e:
307
+ print(f"Error in chunk processing: {e}")
308
+ audio_queue.put((None, None))
309
+
310
+ audio_queue.put(('DONE', None))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
+ # Start parallel processing in background thread
313
+ processing_thread = threading.Thread(target=process_chunks)
314
+ processing_thread.start()
 
 
 
315
 
316
+ # Stream audio chunks as they become available (in order)
317
+ completed_chunks = {}
318
+ next_chunk_to_send = 0
319
 
 
320
  while True:
321
+ chunk_index, wav_data = audio_queue.get()
322
+
323
+ if chunk_index == 'DONE':
 
 
 
 
324
  break
325
 
326
+ if wav_data is None:
327
+ continue
328
+
329
+ # Store completed chunk
330
+ completed_chunks[chunk_index] = wav_data
331
+
332
+ # Send chunks in order
333
+ while next_chunk_to_send in completed_chunks:
334
+ audio_b64 = base64.b64encode(completed_chunks[next_chunk_to_send]).decode("ascii")
335
+ yield f"data: {json.dumps({'type': 'audio_chunk', 'audio_base64': audio_b64, 'chunk_index': next_chunk_to_send})}\n\n"
336
+ del completed_chunks[next_chunk_to_send]
337
+ next_chunk_to_send += 1
338
 
339
+ processing_thread.join()
340
  tts_end = time.perf_counter()
341
  t_total = time.perf_counter() - t_start
342