Pepguy committed on
Commit
83daf1c
·
verified ·
1 Parent(s): 3ae06f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -59
app.py CHANGED
@@ -1,8 +1,5 @@
1
  # pip install flask google-genai
2
- import os
3
- import time
4
- import base64
5
- import struct
6
  from flask import Flask, request, render_template_string, jsonify
7
  from google import genai
8
  from google.genai import types
@@ -12,12 +9,15 @@ app = Flask(__name__)
12
  HTML = """
13
  <!DOCTYPE html>
14
  <html>
15
- <head><meta charset="UTF-8"><title>Gemini Multi</title></head>
16
  <body style="font-family:sans-serif;padding:2rem;">
17
- <h1>Gemini Multi (Text → TTS)</h1>
18
  <form id="genai-form" enctype="multipart/form-data">
19
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
20
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
 
 
 
21
  <button type="submit">Generate</button>
22
  </form>
23
 
@@ -49,21 +49,20 @@ HTML = """
49
  }
50
 
51
  if (data.timings) {
52
- out.textContent = 'text_seconds: ' + data.timings.text_seconds +
53
- ', tts_seconds: ' + data.timings.tts_seconds +
54
- ', total_seconds: ' + data.timings.total_seconds;
55
-
56
- // return;
 
 
57
  }
58
 
59
- // out.textContent = data.text || "(no text)";
60
  if (data.audio_base64) {
61
- // create audio element and play
62
  const audio = document.createElement('audio');
63
  audio.controls = true;
64
  audio.src = "data:audio/wav;base64," + data.audio_base64;
65
  audioDiv.appendChild(audio);
66
- // user-gesture triggered due to form submit — autoplay is allowed in that context in most browsers
67
  try { audio.play().catch(()=>{}); } catch(e){}
68
  } else {
69
  audioDiv.textContent = 'No audio returned';
@@ -78,35 +77,26 @@ out.textContent = 'text_seconds: ' + data.timings.text_seconds +
78
  </html>
79
  """
80
 
81
- # reuse a single client instance (do not recreate per request)
82
# Reuse a single client per process. SECURITY: the API key must come from the
# environment, never from a literal in source control — the previously
# committed key is public and must be revoked/rotated.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
83
 
84
def wrap_pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
    """Wrap raw little-endian PCM samples in a minimal RIFF/WAV header.

    Adjust sample_rate if your PCM stream is not 24 kHz.
    """
    frame_bytes = num_channels * bits_per_sample // 8
    bytes_per_second = sample_rate * num_channels * bits_per_sample // 8
    payload_size = len(pcm_data)
    # Standard 44-byte header: RIFF chunk, "fmt " subchunk, "data" subchunk.
    fmt_chunk = struct.pack(
        "<IHHIIHH",
        16,                 # fmt subchunk size
        1,                  # audio format 1 = uncompressed PCM
        num_channels,
        sample_rate,
        bytes_per_second,
        frame_bytes,
        bits_per_sample,
    )
    pieces = [
        b"RIFF", struct.pack("<I", 36 + payload_size), b"WAVE",
        b"fmt ", fmt_chunk,
        b"data", struct.pack("<I", payload_size),
        pcm_data,
    ]
    return b"".join(pieces)
95
 
96
def extract_text(resp) -> str:
    """Robustly extract text from a generate_content response.

    Prefers the top-level ``.text`` shortcut; otherwise walks every
    candidate's content (which may expose ``.parts`` or be a plain
    list/tuple of parts) and joins the non-empty text fragments.
    """
    shortcut = getattr(resp, "text", None)
    if shortcut:
        return shortcut

    fragments = []
    for candidate in getattr(resp, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        parts = getattr(content, "parts", None)
        if not parts and isinstance(content, (list, tuple)):
            parts = content  # some responses hand back a bare list of parts
        for part in parts or []:
            fragment = getattr(part, "text", None)
            if fragment:
                fragments.append(fragment)
    return "\n".join(fragments).strip()
111
 
112
  @app.route('/')
@@ -118,32 +108,29 @@ def generate():
118
  t_start = time.perf_counter()
119
  prompt = (request.form.get("text") or "").strip()
120
  file = request.files.get("image")
121
- image_bytes = None
122
- mime_type = None
123
- if file:
124
- image_bytes = file.read()
125
- mime_type = file.mimetype
126
 
127
- if not prompt and not image_bytes:
128
  return jsonify({"error": "No input provided"}), 400
129
 
130
- # Build parts for the text model (multimodal)
131
  parts = []
132
  if prompt:
133
  parts.append(types.Part.from_text(text=prompt))
134
- if image_bytes:
135
- parts.append(types.Part.from_bytes(data=image_bytes, mime_type=mime_type or "image/png"))
136
 
137
- # 1) Text generation (call a text-capable multimodal model)
138
  t0 = time.perf_counter()
139
  try:
140
  gen_resp = client.models.generate_content(
141
- model="gemini-2.5-flash-lite", # fast multimodal text model
142
  contents=[types.Content(role="user", parts=parts)],
143
  config=types.GenerateContentConfig(response_mime_type="text/plain"),
144
  )
145
  except Exception as e:
146
- app.logger.exception("Text generation failed")
147
  return jsonify({"error": f"text generation failed: {str(e)}"}), 500
148
  t1 = time.perf_counter()
149
 
@@ -151,44 +138,35 @@ def generate():
151
  if not final_text:
152
  return jsonify({"error": "Text generation returned empty"}), 500
153
 
154
- # 2) TTS: use the preview TTS model (sequential because TTS needs the generated text)
 
155
  tts_start = time.perf_counter()
156
  try:
157
  tts_resp = client.models.generate_content(
158
  model="gemini-2.5-flash-preview-tts",
159
- contents=[types.Content(role="user", parts=[types.Part.from_text(text=final_text)])],
160
  config=types.GenerateContentConfig(
161
  response_modalities=["AUDIO"],
162
  speech_config=types.SpeechConfig(
163
  voice_config=types.VoiceConfig(
164
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
165
- voice_name="Sadachbia" # change voice if you want
166
- )
167
  )
168
  )
169
  )
170
  )
171
  except Exception as e:
172
- app.logger.exception("TTS call failed")
173
  return jsonify({"error": f"tts generation failed: {str(e)}", "text": final_text}), 500
174
  tts_end = time.perf_counter()
175
 
176
- # extract raw PCM bytes from TTS response
177
  pcm_bytes = None
178
  for cand in getattr(tts_resp, "candidates", []) or []:
179
- content = getattr(cand, "content", None)
180
- parts = getattr(content, "parts", None) or (content if isinstance(content, (list, tuple)) else [])
181
- for p in parts or []:
182
- inline = getattr(p, "inline_data", None)
183
- if inline and getattr(inline, "data", None):
184
- pcm_bytes = inline.data
185
  break
186
- if pcm_bytes:
187
- break
188
 
189
  if not pcm_bytes:
190
- # TTS unexpectedly returned no audio — return text with an error
191
- app.logger.error("TTS returned no inline_data")
192
  return jsonify({"error": "TTS returned no audio", "text": final_text}), 500
193
 
194
  wav = wrap_pcm_to_wav(pcm_bytes)
 
1
  # pip install flask google-genai
2
+ import os, time, base64, struct
 
 
 
3
  from flask import Flask, request, render_template_string, jsonify
4
  from google import genai
5
  from google.genai import types
 
9
  HTML = """
10
  <!DOCTYPE html>
11
  <html>
12
+ <head><meta charset="UTF-8"><title>Gemini Multi (Text → Styled TTS)</title></head>
13
  <body style="font-family:sans-serif;padding:2rem;">
14
+ <h1>Gemini Multi (Text + Image Styled TTS)</h1>
15
  <form id="genai-form" enctype="multipart/form-data">
16
  <textarea id="prompt" name="text" rows="6" cols="60" placeholder="Enter prompt"></textarea><br/><br/>
17
  <input type="file" id="image" name="image" accept="image/*" /><br/><br/>
18
+ <label>Voice: <input id="voice" name="voice" value="Sadachbia" /></label><br/>
19
+ <label>Accent: <input id="accent" name="accent" value="British" /></label><br/>
20
+ <label>Tone: <input id="tone" name="tone" value="casual and friendly" /></label><br/><br/>
21
  <button type="submit">Generate</button>
22
  </form>
23
 
 
49
  }
50
 
51
  if (data.timings) {
52
+ out.textContent =
53
+ 'text_seconds: ' + data.timings.text_seconds +
54
+ ', tts_seconds: ' + data.timings.tts_seconds +
55
+ ', total_seconds: ' + data.timings.total_seconds +
56
+ '\\n\\n' + (data.text || "(no text)");
57
+ } else {
58
+ out.textContent = data.text || "(no text)";
59
  }
60
 
 
61
  if (data.audio_base64) {
 
62
  const audio = document.createElement('audio');
63
  audio.controls = true;
64
  audio.src = "data:audio/wav;base64," + data.audio_base64;
65
  audioDiv.appendChild(audio);
 
66
  try { audio.play().catch(()=>{}); } catch(e){}
67
  } else {
68
  audioDiv.textContent = 'No audio returned';
 
77
  </html>
78
  """
79
 
 
80
# Reuse a single client per process. SECURITY: the API key must come from the
# environment, never from a literal in source control — the previously
# committed key is public and must be revoked/rotated.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
81
 
82
def wrap_pcm_to_wav(pcm_data: bytes, sample_rate=24000, num_channels=1, bits_per_sample=16) -> bytes:
    """Prefix raw little-endian PCM with a standard 44-byte WAV header."""
    frame_size = num_channels * bits_per_sample // 8
    avg_bytes_per_sec = sample_rate * num_channels * bits_per_sample // 8
    n = len(pcm_data)
    riff = b"RIFF" + struct.pack("<I", 36 + n) + b"WAVE"
    # fmt subchunk: size=16, format=1 (uncompressed PCM), then layout fields.
    fmt = b"fmt " + struct.pack(
        "<IHHIIHH", 16, 1, num_channels, sample_rate, avg_bytes_per_sec, frame_size, bits_per_sample
    )
    data = b"data" + struct.pack("<I", n)
    return riff + fmt + data + pcm_data
90
 
91
def extract_text(resp) -> str:
    """Extract the joined text from a generate_content response.

    Uses the top-level ``.text`` shortcut when present; otherwise
    gathers the text of every part of every candidate's content.
    """
    quick = getattr(resp, "text", None)
    if quick:
        return quick
    pieces = [
        getattr(part, "text", None)
        for cand in (getattr(resp, "candidates", None) or [])
        for part in (getattr(getattr(cand, "content", None), "parts", None) or [])
    ]
    return "\n".join(p for p in pieces if p).strip()
101
 
102
  @app.route('/')
 
108
  t_start = time.perf_counter()
109
  prompt = (request.form.get("text") or "").strip()
110
  file = request.files.get("image")
111
+ voice = (request.form.get("voice") or "Sadachbia").strip()
112
+ accent = (request.form.get("accent") or "British").strip()
113
+ tone = (request.form.get("tone") or "casual and friendly").strip()
 
 
114
 
115
+ if not prompt and not file:
116
  return jsonify({"error": "No input provided"}), 400
117
 
118
+ # parts for multimodal input
119
  parts = []
120
  if prompt:
121
  parts.append(types.Part.from_text(text=prompt))
122
+ if file:
123
+ parts.append(types.Part.from_bytes(data=file.read(), mime_type=file.mimetype or "image/png"))
124
 
125
+ # 1) Generate text from multimodal input
126
  t0 = time.perf_counter()
127
  try:
128
  gen_resp = client.models.generate_content(
129
+ model="gemini-2.5-flash-lite",
130
  contents=[types.Content(role="user", parts=parts)],
131
  config=types.GenerateContentConfig(response_mime_type="text/plain"),
132
  )
133
  except Exception as e:
 
134
  return jsonify({"error": f"text generation failed: {str(e)}"}), 500
135
  t1 = time.perf_counter()
136
 
 
138
  if not final_text:
139
  return jsonify({"error": "Text generation returned empty"}), 500
140
 
141
+ # 2) Voice-prompted TTS
142
+ style_prompt = f"Say the following in a {accent} accent with a {tone} tone:\n\n{final_text}"
143
  tts_start = time.perf_counter()
144
  try:
145
  tts_resp = client.models.generate_content(
146
  model="gemini-2.5-flash-preview-tts",
147
+ contents=[types.Content(role="user", parts=[types.Part.from_text(text=style_prompt)])],
148
  config=types.GenerateContentConfig(
149
  response_modalities=["AUDIO"],
150
  speech_config=types.SpeechConfig(
151
  voice_config=types.VoiceConfig(
152
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice)
 
 
153
  )
154
  )
155
  )
156
  )
157
  except Exception as e:
 
158
  return jsonify({"error": f"tts generation failed: {str(e)}", "text": final_text}), 500
159
  tts_end = time.perf_counter()
160
 
 
161
  pcm_bytes = None
162
  for cand in getattr(tts_resp, "candidates", []) or []:
163
+ for p in getattr(cand.content, "parts", []):
164
+ if getattr(p, "inline_data", None) and p.inline_data.data:
165
+ pcm_bytes = p.inline_data.data
 
 
 
166
  break
167
+ if pcm_bytes: break
 
168
 
169
  if not pcm_bytes:
 
 
170
  return jsonify({"error": "TTS returned no audio", "text": final_text}), 500
171
 
172
  wav = wrap_pcm_to_wav(pcm_bytes)