rairo committed on
Commit
57aa416
·
verified ·
1 Parent(s): b65dc51

Narration with Gemini 2

Browse files
Files changed (1) hide show
  1. app.py +70 -95
app.py CHANGED
@@ -6,8 +6,8 @@ from google.genai import types
6
  import re
7
  import time
8
  import os
 
9
  import wave
10
- import base64
11
 
12
  # Disable Streamlit analytics (prevents PermissionError in some environments)
13
  os.environ["STREAMLIT_ANALYTICS_ENABLED"] = "false"
@@ -36,7 +36,8 @@ except Exception as e:
36
  # 1.3 Constants (model IDs, exactly as in original code)
37
  CATEGORY_MODEL = "gemini-2.0-flash-exp"
38
  GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
39
- TTS_MODEL = "gemini-2.5-flash-preview-tts"
 
40
 
41
  # 1.4 Helper to parse numbered steps out of Gemini text
42
  def parse_numbered_steps(text):
@@ -49,51 +50,15 @@ def parse_numbered_steps(text):
49
  steps = re.findall(r"\n\s*(\d+)\.\s*(.*)", text, re.MULTILINE)
50
  return [(int(num), desc.strip()) for num, desc in steps]
51
 
52
- # 1.5 TTS Helper Functions
53
- def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
54
- """Create a wave file from PCM data"""
55
- with wave.open(filename, "wb") as wf:
56
  wf.setnchannels(channels)
57
  wf.setsampwidth(sample_width)
58
  wf.setframerate(rate)
59
  wf.writeframes(pcm)
60
-
61
- def generate_speech(text, voice_name='Kore'):
62
- """Generate speech from text using Gemini TTS"""
63
- try:
64
- response = client.models.generate_content(
65
- model=TTS_MODEL,
66
- contents=f"Say in a clear, helpful tone: {text}",
67
- config=types.GenerateContentConfig(
68
- response_modalities=["AUDIO"],
69
- speech_config=types.SpeechConfig(
70
- voice_config=types.VoiceConfig(
71
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
72
- voice_name=voice_name,
73
- )
74
- )
75
- ),
76
- )
77
- )
78
-
79
- audio_data = response.candidates[0].content.parts[0].inline_data.data
80
- return audio_data
81
- except Exception as e:
82
- st.error(f"TTS generation failed: {str(e)}")
83
- return None
84
-
85
- def create_audio_player(audio_data, key):
86
- """Create an audio player widget for the generated speech"""
87
- if audio_data:
88
- # Convert audio data to base64 for HTML audio player
89
- audio_b64 = base64.b64encode(audio_data).decode()
90
- audio_html = f"""
91
- <audio controls style="width: 100%;">
92
- <source src="data:audio/wav;base64,{audio_b64}" type="audio/wav">
93
- Your browser does not support the audio element.
94
- </audio>
95
- """
96
- st.markdown(audio_html, unsafe_allow_html=True)
97
 
98
  # ─────────────────────────────────────────────────────────────────────────────
99
  # 2. SESSION STATE SETUP
@@ -106,7 +71,7 @@ if "app_state" not in st.session_state:
106
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
107
  "project_title": "", "project_description": "", "upcycling_options": [],
108
  "plan_approved": False, "initial_plan": "", "user_image": None,
109
- "audio_cache": {} # Cache for generated audio
110
  }
111
 
112
  # ─────────────────────────────────────────────────────────────────────────────
@@ -121,10 +86,10 @@ def reset_state():
121
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
122
  "project_title": "", "project_description": "", "upcycling_options": [],
123
  "plan_approved": False, "initial_plan": "", "user_image": None,
124
- "audio_cache": {}
125
  }
126
  st.success("βœ… Reset complete!")
127
- st.rerun()
128
 
129
  def send_text_request(model_name, prompt, image):
130
  """Helper to send requests that expect only a text response."""
@@ -152,7 +117,8 @@ def initial_analysis(uploaded_file, context_text):
152
  "Reply with ONLY the category name."
153
  )
154
  category = send_text_request(CATEGORY_MODEL, category_prompt, image)
155
- if not category: return
 
156
  st.session_state.app_state['category'] = category
157
 
158
  plan_prompt = f"""
@@ -171,7 +137,8 @@ def initial_analysis(uploaded_file, context_text):
171
  [Your plan or 3 options]
172
  """
173
  plan_response = send_text_request(GENERATION_MODEL, plan_prompt, image)
174
- if not plan_response: return
 
175
 
176
  try:
177
  st.session_state.app_state['project_title'] = re.search(r"TITLE:\s*(.*)", plan_response).group(1).strip()
@@ -198,7 +165,8 @@ def generate_detailed_guide_with_images(selected_option=None):
198
  """Generates the detailed guide with steps and illustrations."""
199
  image = st.session_state.app_state.get('user_image')
200
  if not image:
201
- st.error("Image not found. Please start over."); return
 
202
 
203
  context = f"The user has approved the plan for '{st.session_state.app_state['project_title']}'."
204
  if selected_option:
@@ -254,13 +222,16 @@ def generate_detailed_guide_with_images(selected_option=None):
254
  st.session_state.app_state['timers'][idx] = val * (60 if "minute" in unit else 1)
255
  else:
256
  st.session_state.app_state['timers'][idx] = 0
 
 
257
  except Exception as e:
258
  st.error(f"Failed to generate or parse the illustrated guide: {str(e)}")
259
 
260
  def render_sidebar_navigation():
261
  st.sidebar.markdown("## Steps Navigation")
262
  steps = st.session_state.app_state['steps']
263
- if not steps: return
 
264
  total_steps = len(steps)
265
  completed = sum(1 for done in st.session_state.app_state['done_flags'].values() if done)
266
  st.sidebar.progress(completed / total_steps if total_steps > 0 else 0)
@@ -270,45 +241,20 @@ def render_sidebar_navigation():
270
  label = f"{'βœ“' if is_done else 'Β·'} Step {idx}"
271
  if st.sidebar.button(label, key=f"nav_{idx}"):
272
  st.session_state.app_state['current_step'] = idx
273
- st.rerun()
274
 
275
  def render_tools_list():
276
  if st.session_state.app_state['tools_list']:
277
  with st.expander("πŸ”§ Required Tools & Materials", expanded=True):
278
- # Add narration button for tools list
279
- col1, col2 = st.columns([4, 1])
280
- with col1:
281
- for item in st.session_state.app_state['tools_list']:
282
- st.markdown(f"- {item}")
283
- with col2:
284
- if st.button("πŸ”Š Narrate Tools", key="narrate_tools"):
285
- tools_text = "Here are the required tools and materials: " + ", ".join(st.session_state.app_state['tools_list'])
286
- if 'tools_audio' not in st.session_state.app_state['audio_cache']:
287
- with st.spinner("Generating narration..."):
288
- st.session_state.app_state['audio_cache']['tools_audio'] = generate_speech(tools_text)
289
-
290
- if st.session_state.app_state['audio_cache']['tools_audio']:
291
- create_audio_player(st.session_state.app_state['audio_cache']['tools_audio'], "tools_player")
292
 
293
  def render_step(idx, text):
294
  total = len(st.session_state.app_state['steps'])
295
  st.markdown(f"### Step {idx} of {total}")
296
-
297
- # Add narration button for each step
298
- col1, col2 = st.columns([4, 1])
299
- with col1:
300
- st.write(text)
301
- with col2:
302
- if st.button("πŸ”Š Narrate", key=f"narrate_step_{idx}"):
303
- audio_key = f'step_{idx}_audio'
304
- if audio_key not in st.session_state.app_state['audio_cache']:
305
- with st.spinner("Generating narration..."):
306
- step_text = f"Step {idx}: {text}"
307
- st.session_state.app_state['audio_cache'][audio_key] = generate_speech(step_text)
308
-
309
- if st.session_state.app_state['audio_cache'][audio_key]:
310
- create_audio_player(st.session_state.app_state['audio_cache'][audio_key], f"step_{idx}_player")
311
 
 
312
  if idx in st.session_state.app_state['images']:
313
  st.image(
314
  st.session_state.app_state['images'][idx],
@@ -316,6 +262,35 @@ def render_step(idx, text):
316
  use_container_width=True
317
  )
318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  done = st.checkbox("βœ… Mark this step as completed", value=st.session_state.app_state['done_flags'].get(idx, False), key=f"done_{idx}")
320
  st.session_state.app_state['done_flags'][idx] = done
321
  notes = st.text_area("πŸ“ Your notes for this step:", value=st.session_state.app_state['notes'].get(idx, ""), height=100, key=f"notes_{idx}")
@@ -324,10 +299,10 @@ def render_step(idx, text):
324
  col1, col2, col3 = st.columns([1, 2, 1])
325
  if idx > 1 and col1.button("⬅️ Previous", key=f"prev_{idx}"):
326
  st.session_state.app_state['current_step'] -= 1
327
- st.rerun()
328
  if idx < total and col3.button("Next ➑️", key=f"next_{idx}"):
329
  st.session_state.app_state['current_step'] += 1
330
- st.rerun()
331
 
332
  # ─────────────────────────────────────────────────────────────────────────────
333
  # 4. APP LAYOUT
@@ -338,29 +313,29 @@ st.title("πŸ› οΈ NeoFix AI-Powered DIY Assistant")
338
 
339
  with st.expander("ℹ️ How it works", expanded=False):
340
  st.write("""
341
- 1. **Upload a photo** of your project or the item you want to fix or build (appliance, car part, plant, craft project).
342
- 2. **(Optional) Describe your goal** for more accurate results.
343
- 3. **Review the Plan.** The AI will propose a plan. If you didn't provide a description, you'll be asked to approve it.
344
- 4. **Get Your Guide** with tools and illustrated step-by-step instructions.
345
- 5. **Follow the Steps** using the interactive checklist with audio narration.
346
- """)
347
 
348
  if not st.session_state.app_state['prompt_sent']:
349
  st.markdown("---")
350
  col1, col2 = st.columns([3, 1])
351
  with col1:
352
  uploaded_image = st.file_uploader("πŸ“· Upload a photo of your project", type=["jpg", "jpeg", "png"])
353
- context_text = st.text_area("✏️ Describe the issue or your goal (optional but recommended)", height=80, placeholder="e.g., 'My toaster won't turn on,' or 'How do I build a desk like this?'")
354
  with col2:
355
  st.markdown("### Actions")
356
  if st.button("πŸš€ Get AI Guidance", type="primary", use_container_width=True):
357
  if uploaded_image:
358
  initial_analysis(uploaded_image, context_text)
359
- st.rerun()
360
  else:
361
  st.warning("⚠️ Please upload an image first!")
362
- if st.button("πŸ”„ Start Over", use_container_width=True):
363
- reset_state()
364
  else:
365
  render_sidebar_navigation()
366
  st.markdown("---")
@@ -375,14 +350,14 @@ else:
375
  for i, option in enumerate(st.session_state.app_state['upcycling_options']):
376
  if st.button(option, key=f"option_{i}"):
377
  generate_detailed_guide_with_images(selected_option=option)
378
- st.rerun()
379
  elif not st.session_state.app_state['plan_approved']:
380
  st.markdown("#### The AI has proposed the following plan:")
381
  st.success(st.session_state.app_state['initial_plan'])
382
  if st.button("βœ… Looks good, proceed with this plan", type="primary"):
383
  st.session_state.app_state['plan_approved'] = True
384
  generate_detailed_guide_with_images()
385
- st.rerun()
386
  else:
387
  render_tools_list()
388
  st.markdown("---")
@@ -392,7 +367,7 @@ else:
392
  render_step(step_num, step_text)
393
  except IndexError:
394
  st.session_state.app_state['current_step'] = 1
395
- st.rerun()
396
 
397
  total_steps = len(st.session_state.app_state['steps'])
398
  done_count = sum(1 for d in st.session_state.app_state['done_flags'].values() if d)
 
6
  import re
7
  import time
8
  import os
9
+ import io
10
  import wave
 
11
 
12
  # Disable Streamlit analytics (prevents PermissionError in some environments)
13
  os.environ["STREAMLIT_ANALYTICS_ENABLED"] = "false"
 
36
  # 1.3 Constants (model IDs, exactly as in original code)
37
  CATEGORY_MODEL = "gemini-2.0-flash-exp"
38
  GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
39
+ TTS_MODEL = "gemini-2.5-flash-preview-tts"
40
+ VOICE_NAME = "Kore"
41
 
42
  # 1.4 Helper to parse numbered steps out of Gemini text
43
  def parse_numbered_steps(text):
 
50
  steps = re.findall(r"\n\s*(\d+)\.\s*(.*)", text, re.MULTILINE)
51
  return [(int(num), desc.strip()) for num, desc in steps]
52
 
53
+ # 1.5 Helper to convert raw PCM into WAV bytes (for in-memory playback)
54
+ def tts_wav_bytes(pcm, channels=1, rate=24000, sample_width=2):
55
+ buf = io.BytesIO()
56
+ with wave.open(buf, "wb") as wf:
57
  wf.setnchannels(channels)
58
  wf.setsampwidth(sample_width)
59
  wf.setframerate(rate)
60
  wf.writeframes(pcm)
61
+ return buf.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # ─────────────────────────────────────────────────────────────────────────────
64
  # 2. SESSION STATE SETUP
 
71
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
72
  "project_title": "", "project_description": "", "upcycling_options": [],
73
  "plan_approved": False, "initial_plan": "", "user_image": None,
74
+ "tts": {} # store TTS WAV bytes per step index
75
  }
76
 
77
  # ─────────────────────────────────────────────────────────────────────────────
 
86
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
87
  "project_title": "", "project_description": "", "upcycling_options": [],
88
  "plan_approved": False, "initial_plan": "", "user_image": None,
89
+ "tts": {}
90
  }
91
  st.success("βœ… Reset complete!")
92
+ st.experimental_rerun()
93
 
94
  def send_text_request(model_name, prompt, image):
95
  """Helper to send requests that expect only a text response."""
 
117
  "Reply with ONLY the category name."
118
  )
119
  category = send_text_request(CATEGORY_MODEL, category_prompt, image)
120
+ if not category:
121
+ return
122
  st.session_state.app_state['category'] = category
123
 
124
  plan_prompt = f"""
 
137
  [Your plan or 3 options]
138
  """
139
  plan_response = send_text_request(GENERATION_MODEL, plan_prompt, image)
140
+ if not plan_response:
141
+ return
142
 
143
  try:
144
  st.session_state.app_state['project_title'] = re.search(r"TITLE:\s*(.*)", plan_response).group(1).strip()
 
165
  """Generates the detailed guide with steps and illustrations."""
166
  image = st.session_state.app_state.get('user_image')
167
  if not image:
168
+ st.error("Image not found. Please start over.")
169
+ return
170
 
171
  context = f"The user has approved the plan for '{st.session_state.app_state['project_title']}'."
172
  if selected_option:
 
222
  st.session_state.app_state['timers'][idx] = val * (60 if "minute" in unit else 1)
223
  else:
224
  st.session_state.app_state['timers'][idx] = 0
225
+ # Initialize empty TTS slot (will be generated on demand)
226
+ st.session_state.app_state['tts'][idx] = None
227
  except Exception as e:
228
  st.error(f"Failed to generate or parse the illustrated guide: {str(e)}")
229
 
230
  def render_sidebar_navigation():
231
  st.sidebar.markdown("## Steps Navigation")
232
  steps = st.session_state.app_state['steps']
233
+ if not steps:
234
+ return
235
  total_steps = len(steps)
236
  completed = sum(1 for done in st.session_state.app_state['done_flags'].values() if done)
237
  st.sidebar.progress(completed / total_steps if total_steps > 0 else 0)
 
241
  label = f"{'βœ“' if is_done else 'Β·'} Step {idx}"
242
  if st.sidebar.button(label, key=f"nav_{idx}"):
243
  st.session_state.app_state['current_step'] = idx
244
+ st.experimental_rerun()
245
 
246
  def render_tools_list():
247
  if st.session_state.app_state['tools_list']:
248
  with st.expander("πŸ”§ Required Tools & Materials", expanded=True):
249
+ for item in st.session_state.app_state['tools_list']:
250
+ st.markdown(f"- {item}")
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  def render_step(idx, text):
253
  total = len(st.session_state.app_state['steps'])
254
  st.markdown(f"### Step {idx} of {total}")
255
+ st.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ # Display illustrative image if available
258
  if idx in st.session_state.app_state['images']:
259
  st.image(
260
  st.session_state.app_state['images'][idx],
 
262
  use_container_width=True
263
  )
264
 
265
+ # TTS generation and playback
266
+ # If we haven't generated TTS for this step yet, do it now
267
+ if st.session_state.app_state['tts'].get(idx) is None:
268
+ try:
269
+ tts_response = client.models.generate_content(
270
+ model=TTS_MODEL,
271
+ contents=text,
272
+ config=types.GenerateContentConfig(
273
+ response_modalities=["AUDIO"],
274
+ speech_config=types.SpeechConfig(
275
+ voice_config=types.VoiceConfig(
276
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(
277
+ voice_name=VOICE_NAME,
278
+ )
279
+ )
280
+ ),
281
+ )
282
+ )
283
+ pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data
284
+ wav_bytes = tts_wav_bytes(pcm_data)
285
+ st.session_state.app_state['tts'][idx] = wav_bytes
286
+ except Exception as e:
287
+ st.error(f"Failed to generate TTS for step {idx}: {e}")
288
+
289
+ # If WAV bytes are available, show a play button
290
+ if st.session_state.app_state['tts'].get(idx):
291
+ st.audio(st.session_state.app_state['tts'][idx], format="audio/wav")
292
+
293
+ # Checkbox and notes
294
  done = st.checkbox("βœ… Mark this step as completed", value=st.session_state.app_state['done_flags'].get(idx, False), key=f"done_{idx}")
295
  st.session_state.app_state['done_flags'][idx] = done
296
  notes = st.text_area("πŸ“ Your notes for this step:", value=st.session_state.app_state['notes'].get(idx, ""), height=100, key=f"notes_{idx}")
 
299
  col1, col2, col3 = st.columns([1, 2, 1])
300
  if idx > 1 and col1.button("⬅️ Previous", key=f"prev_{idx}"):
301
  st.session_state.app_state['current_step'] -= 1
302
+ st.experimental_rerun()
303
  if idx < total and col3.button("Next ➑️", key=f"next_{idx}"):
304
  st.session_state.app_state['current_step'] += 1
305
+ st.experimental_rerun()
306
 
307
  # ─────────────────────────────────────────────────────────────────────────────
308
  # 4. APP LAYOUT
 
313
 
314
  with st.expander("ℹ️ How it works", expanded=False):
315
  st.write("""
316
+ 1. **Upload a photo** of your project or the item you want to fix or build (appliance, car part, plant, craft project).
317
+ 2. **(Optional) Describe your goal** for more accurate results.
318
+ 3. **Review the Plan.** The AI will propose a plan. If you didn’t provide a description, you’ll be asked to approve it.
319
+ 4. **Get Your Guide** with tools and illustrated step-by-step instructions.
320
+ 5. **Follow the Steps** using the interactive checklist (with audio narration for each step).
321
+ """)
322
 
323
  if not st.session_state.app_state['prompt_sent']:
324
  st.markdown("---")
325
  col1, col2 = st.columns([3, 1])
326
  with col1:
327
  uploaded_image = st.file_uploader("πŸ“· Upload a photo of your project", type=["jpg", "jpeg", "png"])
328
+ context_text = st.text_area("✏️ Describe the issue or your goal (optional but recommended)", height=80, placeholder="e.g., β€˜My toaster won’t turn on,’ or β€˜How do I build a desk like this?’")
329
  with col2:
330
  st.markdown("### Actions")
331
  if st.button("πŸš€ Get AI Guidance", type="primary", use_container_width=True):
332
  if uploaded_image:
333
  initial_analysis(uploaded_image, context_text)
334
+ st.experimental_rerun()
335
  else:
336
  st.warning("⚠️ Please upload an image first!")
337
+ if st.button("πŸ”„ Start Over", use_container_width=True):
338
+ reset_state()
339
  else:
340
  render_sidebar_navigation()
341
  st.markdown("---")
 
350
  for i, option in enumerate(st.session_state.app_state['upcycling_options']):
351
  if st.button(option, key=f"option_{i}"):
352
  generate_detailed_guide_with_images(selected_option=option)
353
+ st.experimental_rerun()
354
  elif not st.session_state.app_state['plan_approved']:
355
  st.markdown("#### The AI has proposed the following plan:")
356
  st.success(st.session_state.app_state['initial_plan'])
357
  if st.button("βœ… Looks good, proceed with this plan", type="primary"):
358
  st.session_state.app_state['plan_approved'] = True
359
  generate_detailed_guide_with_images()
360
+ st.experimental_rerun()
361
  else:
362
  render_tools_list()
363
  st.markdown("---")
 
367
  render_step(step_num, step_text)
368
  except IndexError:
369
  st.session_state.app_state['current_step'] = 1
370
+ st.experimental_rerun()
371
 
372
  total_steps = len(st.session_state.app_state['steps'])
373
  done_count = sum(1 for d in st.session_state.app_state['done_flags'].values() if d)