ai_text_plus_audio

Sleeping

App Files Files Community

Pepguy committed on Oct 4, 2025

Commit

83daf1c

verified ·

1 Parent(s): 3ae06f6

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -59

app.py CHANGED Viewed

@@ -1,8 +1,5 @@
 # pip install flask google-genai
-import os
-import time
-import base64
-import struct
 from flask import Flask, request, render_template_string, jsonify
 from google import genai
 from google.genai import types
@@ -12,12 +9,15 @@ app = Flask(__name__)
 HTML = """
 <!DOCTYPE html>
 <html>
-<head><meta charset="UTF-8"><title>Gemini Multi</title></head>
 <body style="font-family:sans-serif;padding:2rem;">
-  <h1>Gemini Multi (Text → TTS)</h1>
   <form id="genai-form" enctype="multipart/form-data">
     <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
     <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
     <button type="submit">Generate</button>
   </form>
@@ -49,21 +49,20 @@ HTML = """
       }
       if (data.timings) {
-out.textContent = 'text_seconds: ' + data.timings.text_seconds +
-', tts_seconds: ' + data.timings.tts_seconds +
-', total_seconds: ' + data.timings.total_seconds;
-       // return;
       }
-     // out.textContent = data.text || "(no text)";
       if (data.audio_base64) {
-        // create audio element and play
         const audio = document.createElement('audio');
         audio.controls = true;
         audio.src = "data:audio/wav;base64," + data.audio_base64;
         audioDiv.appendChild(audio);
-        // user-gesture triggered due to form submit — autoplay is allowed in that context in most browsers
         try { audio.play().catch(()=>{}); } catch(e){}
       } else {
         audioDiv.textContent = 'No audio returned';
@@ -78,35 +77,26 @@ out.textContent = 'text_seconds: ' + data.timings.text_seconds +
 </html>
 """
-# reuse a single client instance (do not recreate per request)
 # Single shared client, created once at import time.
 # The API key is read from the GEMINI_API_KEY environment variable rather than
 # being written into the source: a key committed to a repository is readable by
 # anyone with access to it and has to be revoked and reissued.
 client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
-def wrap_pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
-    """Wrap raw PCM (little-endian) into a WAV header. Adjust sample_rate if your PCM isn't 24000."""
     # Returns a new bytes object: a 44-byte RIFF/WAVE header followed by pcm_data
     # unchanged. Every struct format below starts with "<", so all header fields
     # are written little-endian.
     # Bytes of audio per second and bytes per sample frame (all channels).
     byte_rate = sample_rate * num_channels * bits_per_sample // 8
     block_align = num_channels * bits_per_sample // 8
     data_size = len(pcm_data)
     # RIFF size field = everything after the first 8 bytes: 36 header bytes + data.
     header = b"RIFF" + struct.pack("<I", 36 + data_size) + b"WAVE"
     # "fmt " chunk: 16 = size of the fields that follow, 1 = PCM format tag.
-    header += b"fmt " + struct.pack("<IHHIIHH",
-        16, 1, num_channels, sample_rate, byte_rate, block_align, bits_per_sample
-    )
     header += b"data" + struct.pack("<I", data_size)
     return header + pcm_data
 def extract_text(resp) -> str:
-    """Robustly extract text from a generate_content response."""
-    # preferred shortcut
-    if getattr(resp, "text", None):
-        return resp.text
     # Fallback when resp.text is missing or empty: gather the non-empty .text of
     # every part of every candidate. The `or []` guards cover attributes that
     # exist but are None.
     parts_text = []
     for cand in getattr(resp, "candidates", []) or []:
         content = getattr(cand, "content", None)
-        # content might be an object with .parts or a list
-        parts = getattr(content, "parts", None) or (content if isinstance(content, (list, tuple)) else [])
-        for p in parts or []:
-            text = getattr(p, "text", None)
-            if text:
-                parts_text.append(text)
     # Pieces are joined with newlines; an empty result is returned as "".
     return "\n".join(parts_text).strip()
 @app.route('/')
@@ -118,32 +108,29 @@ def generate():
     t_start = time.perf_counter()
     prompt = (request.form.get("text") or "").strip()
     file = request.files.get("image")
-    image_bytes = None
-    mime_type = None
-    if file:
-        image_bytes = file.read()
-        mime_type = file.mimetype
-    if not prompt and not image_bytes:
         return jsonify({"error": "No input provided"}), 400
-    # Build parts for the text model (multimodal)
     parts = []
     if prompt:
         parts.append(types.Part.from_text(text=prompt))
-    if image_bytes:
-        parts.append(types.Part.from_bytes(data=image_bytes, mime_type=mime_type or "image/png"))
-    # 1) Text generation (call a text-capable multimodal model)
     t0 = time.perf_counter()
     try:
         gen_resp = client.models.generate_content(
-            model="gemini-2.5-flash-lite",  # fast multimodal text model
             contents=[types.Content(role="user", parts=parts)],
             config=types.GenerateContentConfig(response_mime_type="text/plain"),
         )
     except Exception as e:
-        app.logger.exception("Text generation failed")
         return jsonify({"error": f"text generation failed: {str(e)}"}), 500
     t1 = time.perf_counter()
@@ -151,44 +138,35 @@ def generate():
     if not final_text:
         return jsonify({"error": "Text generation returned empty"}), 500
-    # 2) TTS: use the preview TTS model (sequential because TTS needs the generated text)
     tts_start = time.perf_counter()
     try:
         tts_resp = client.models.generate_content(
             model="gemini-2.5-flash-preview-tts",
-            contents=[types.Content(role="user", parts=[types.Part.from_text(text=final_text)])],
             config=types.GenerateContentConfig(
                 response_modalities=["AUDIO"],
                 speech_config=types.SpeechConfig(
                     voice_config=types.VoiceConfig(
-                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                            voice_name="Sadachbia"  # change voice if you want
-                        )
                     )
                 )
             )
         )
     except Exception as e:
-        app.logger.exception("TTS call failed")
         return jsonify({"error": f"tts generation failed: {str(e)}", "text": final_text}), 500
     tts_end = time.perf_counter()
-    # extract raw PCM bytes from TTS response
     pcm_bytes = None
     for cand in getattr(tts_resp, "candidates", []) or []:
-        content = getattr(cand, "content", None)
-        parts = getattr(content, "parts", None) or (content if isinstance(content, (list, tuple)) else [])
-        for p in parts or []:
-            inline = getattr(p, "inline_data", None)
-            if inline and getattr(inline, "data", None):
-                pcm_bytes = inline.data
                 break
-        if pcm_bytes:
-            break
     if not pcm_bytes:
-        # TTS unexpectedly returned no audio — return text with an error
-        app.logger.error("TTS returned no inline_data")
         return jsonify({"error": "TTS returned no audio", "text": final_text}), 500
     wav = wrap_pcm_to_wav(pcm_bytes)

 # pip install flask google-genai
+import os, time, base64, struct
 from flask import Flask, request, render_template_string, jsonify
 from google import genai
 from google.genai import types
 HTML = """
 <!DOCTYPE html>
 <html>
+<head><meta charset="UTF-8"><title>Gemini Multi (Text → Styled TTS)</title></head>
 <body style="font-family:sans-serif;padding:2rem;">
+  <h1>Gemini Multi (Text + Image → Styled TTS)</h1>
   <form id="genai-form" enctype="multipart/form-data">
     <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
     <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
+    <label>Voice: <input id="voice" name="voice" value="Sadachbia" /></label><br/>
+    <label>Accent: <input id="accent" name="accent" value="British" /></label><br/>
+    <label>Tone: <input id="tone" name="tone" value="casual and friendly" /></label><br/><br/>
     <button type="submit">Generate</button>
   </form>
       }
       if (data.timings) {
+        out.textContent =
+          'text_seconds: ' + data.timings.text_seconds +
+          ', tts_seconds: ' + data.timings.tts_seconds +
+          ', total_seconds: ' + data.timings.total_seconds +
+          '\\n\\n' + (data.text || "(no text)");
+      } else {
+        out.textContent = data.text || "(no text)";
       }
       if (data.audio_base64) {
         const audio = document.createElement('audio');
         audio.controls = true;
         audio.src = "data:audio/wav;base64," + data.audio_base64;
         audioDiv.appendChild(audio);
         try { audio.play().catch(()=>{}); } catch(e){}
       } else {
         audioDiv.textContent = 'No audio returned';
 </html>
 """
 # Single shared client, created once at import time.
 # The API key is read from the GEMINI_API_KEY environment variable rather than
 # being written into the source: a key committed to a repository is readable by
 # anyone with access to it and has to be revoked and reissued.
 client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
+def wrap_pcm_to_wav(pcm_data: bytes, sample_rate=24000, num_channels=1, bits_per_sample=16) -> bytes:
     # Prepend a 44-byte RIFF/WAVE header to pcm_data and return header + data.
     # pcm_data is copied through unchanged; the defaults describe 24000 Hz,
     # 1-channel, 16-bit audio. Every struct format below starts with "<", so
     # all header fields are written little-endian.
     # Bytes of audio per second and bytes per sample frame (all channels).
     byte_rate = sample_rate * num_channels * bits_per_sample // 8
     block_align = num_channels * bits_per_sample // 8
     data_size = len(pcm_data)
     # RIFF size field = everything after the first 8 bytes: 36 header bytes + data.
     header = b"RIFF" + struct.pack("<I", 36 + data_size) + b"WAVE"
     # "fmt " chunk: 16 = size of the fields that follow, 1 = PCM format tag.
+    header += b"fmt " + struct.pack("<IHHIIHH", 16, 1, num_channels, sample_rate, byte_rate, block_align, bits_per_sample)
     header += b"data" + struct.pack("<I", data_size)
     return header + pcm_data
 def extract_text(resp) -> str:
     # Return the text carried by a response object as a single string.
     # Fast path: use resp.text directly when it is present and non-empty.
+    if getattr(resp, "text", None): return resp.text
     # Otherwise gather the non-empty .text of every part of every candidate.
     # The `or []` guards cover attributes that exist but are None.
     parts_text = []
     for cand in getattr(resp, "candidates", []) or []:
         content = getattr(cand, "content", None)
+        parts = getattr(content, "parts", None) or []
+        for p in parts:
+            if getattr(p, "text", None):
+                parts_text.append(p.text)
     # Pieces are joined with newlines; an empty result is returned as "".
     return "\n".join(parts_text).strip()
 @app.route('/')
     t_start = time.perf_counter()
     prompt = (request.form.get("text") or "").strip()
     file = request.files.get("image")
+    voice = (request.form.get("voice") or "Sadachbia").strip()
+    accent = (request.form.get("accent") or "British").strip()
+    tone = (request.form.get("tone") or "casual and friendly").strip()
+    if not prompt and not file:
         return jsonify({"error": "No input provided"}), 400
+    # parts for multimodal input
     parts = []
     if prompt:
         parts.append(types.Part.from_text(text=prompt))
+    if file:
+        parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
+    # 1) Generate text from multimodal input
     t0 = time.perf_counter()
     try:
         gen_resp = client.models.generate_content(
+            model="gemini-2.5-flash-lite",
             contents=[types.Content(role="user", parts=parts)],
             config=types.GenerateContentConfig(response_mime_type="text/plain"),
         )
     except Exception as e:
         return jsonify({"error": f"text generation failed: {str(e)}"}), 500
     t1 = time.perf_counter()
     if not final_text:
         return jsonify({"error": "Text generation returned empty"}), 500
+    # 2) Voice-prompted TTS
+    style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{final_text}"
     tts_start = time.perf_counter()
     try:
         tts_resp = client.models.generate_content(
             model="gemini-2.5-flash-preview-tts",
+            contents=[types.Content(role="user", parts=[types.Part.from_text(text=style_prompt)])],
             config=types.GenerateContentConfig(
                 response_modalities=["AUDIO"],
                 speech_config=types.SpeechConfig(
                     voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
                     )
                 )
             )
         )
     except Exception as e:
         return jsonify({"error": f"tts generation failed: {str(e)}", "text": final_text}), 500
     tts_end = time.perf_counter()
     pcm_bytes = None
     for cand in getattr(tts_resp, "candidates", []) or []:
+        for p in getattr(cand.content, "parts", []):
+            if getattr(p, "inline_data", None) and p.inline_data.data:
+                pcm_bytes = p.inline_data.data
                 break
+        if pcm_bytes: break
     if not pcm_bytes:
         return jsonify({"error": "TTS returned no audio", "text": final_text}), 500
     wav = wrap_pcm_to_wav(pcm_bytes)