rairo committed on
Commit
57aa416
·
verified ·
1 Parent(s): b65dc51

Narration with Gemini 2

Browse files
Files changed (1) hide show
  1. app.py +70 -95
app.py CHANGED
@@ -6,8 +6,8 @@ from google.genai import types
6
  import re
7
  import time
8
  import os
 
9
  import wave
10
- import base64
11
 
12
  # Disable Streamlit analytics (prevents PermissionError in some environments)
13
  os.environ["STREAMLIT_ANALYTICS_ENABLED"] = "false"
@@ -36,7 +36,8 @@ except Exception as e:
36
  # 1.3 Constants (model IDs, exactly as in original code)
37
  CATEGORY_MODEL = "gemini-2.0-flash-exp"
38
  GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
39
- TTS_MODEL = "gemini-2.5-flash-preview-tts"
 
40
 
41
  # 1.4 Helper to parse numbered steps out of Gemini text
42
  def parse_numbered_steps(text):
@@ -49,51 +50,15 @@ def parse_numbered_steps(text):
49
  steps = re.findall(r"\n\s*(\d+)\.\s*(.*)", text, re.MULTILINE)
50
  return [(int(num), desc.strip()) for num, desc in steps]
51
 
52
- # 1.5 TTS Helper Functions
53
- def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
54
- """Create a wave file from PCM data"""
55
- with wave.open(filename, "wb") as wf:
56
  wf.setnchannels(channels)
57
  wf.setsampwidth(sample_width)
58
  wf.setframerate(rate)
59
  wf.writeframes(pcm)
60
-
61
- def generate_speech(text, voice_name='Kore'):
62
- """Generate speech from text using Gemini TTS"""
63
- try:
64
- response = client.models.generate_content(
65
- model=TTS_MODEL,
66
- contents=f"Say in a clear, helpful tone: {text}",
67
- config=types.GenerateContentConfig(
68
- response_modalities=["AUDIO"],
69
- speech_config=types.SpeechConfig(
70
- voice_config=types.VoiceConfig(
71
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
72
- voice_name=voice_name,
73
- )
74
- )
75
- ),
76
- )
77
- )
78
-
79
- audio_data = response.candidates[0].content.parts[0].inline_data.data
80
- return audio_data
81
- except Exception as e:
82
- st.error(f"TTS generation failed: {str(e)}")
83
- return None
84
-
85
- def create_audio_player(audio_data, key):
86
- """Create an audio player widget for the generated speech"""
87
- if audio_data:
88
- # Convert audio data to base64 for HTML audio player
89
- audio_b64 = base64.b64encode(audio_data).decode()
90
- audio_html = f"""
91
- <audio controls style="width: 100%;">
92
- <source src="data:audio/wav;base64,{audio_b64}" type="audio/wav">
93
- Your browser does not support the audio element.
94
- </audio>
95
- """
96
- st.markdown(audio_html, unsafe_allow_html=True)
97
 
98
  # ─────────────────────────────────────────────────────────────────────────────
99
  # 2. SESSION STATE SETUP
@@ -106,7 +71,7 @@ if "app_state" not in st.session_state:
106
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
107
  "project_title": "", "project_description": "", "upcycling_options": [],
108
  "plan_approved": False, "initial_plan": "", "user_image": None,
109
- "audio_cache": {} # Cache for generated audio
110
  }
111
 
112
  # ─────────────────────────────────────────────────────────────────────────────
@@ -121,10 +86,10 @@ def reset_state():
121
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
122
  "project_title": "", "project_description": "", "upcycling_options": [],
123
  "plan_approved": False, "initial_plan": "", "user_image": None,
124
- "audio_cache": {}
125
  }
126
  st.success("βœ… Reset complete!")
127
- st.rerun()
128
 
129
  def send_text_request(model_name, prompt, image):
130
  """Helper to send requests that expect only a text response."""
@@ -152,7 +117,8 @@ def initial_analysis(uploaded_file, context_text):
152
  "Reply with ONLY the category name."
153
  )
154
  category = send_text_request(CATEGORY_MODEL, category_prompt, image)
155
- if not category: return
 
156
  st.session_state.app_state['category'] = category
157
 
158
  plan_prompt = f"""
@@ -171,7 +137,8 @@ def initial_analysis(uploaded_file, context_text):
171
  [Your plan or 3 options]
172
  """
173
  plan_response = send_text_request(GENERATION_MODEL, plan_prompt, image)
174
- if not plan_response: return
 
175
 
176
  try:
177
  st.session_state.app_state['project_title'] = re.search(r"TITLE:\s*(.*)", plan_response).group(1).strip()
@@ -198,7 +165,8 @@ def generate_detailed_guide_with_images(selected_option=None):
198
  """Generates the detailed guide with steps and illustrations."""
199
  image = st.session_state.app_state.get('user_image')
200
  if not image:
201
- st.error("Image not found. Please start over."); return
 
202
 
203
  context = f"The user has approved the plan for '{st.session_state.app_state['project_title']}'."
204
  if selected_option:
@@ -254,13 +222,16 @@ def generate_detailed_guide_with_images(selected_option=None):
254
  st.session_state.app_state['timers'][idx] = val * (60 if "minute" in unit else 1)
255
  else:
256
  st.session_state.app_state['timers'][idx] = 0
 
 
257
  except Exception as e:
258
  st.error(f"Failed to generate or parse the illustrated guide: {str(e)}")
259
 
260
  def render_sidebar_navigation():
261
  st.sidebar.markdown("## Steps Navigation")
262
  steps = st.session_state.app_state['steps']
263
- if not steps: return
 
264
  total_steps = len(steps)
265
  completed = sum(1 for done in st.session_state.app_state['done_flags'].values() if done)
266
  st.sidebar.progress(completed / total_steps if total_steps > 0 else 0)
@@ -270,45 +241,20 @@ def render_sidebar_navigation():
270
  label = f"{'βœ“' if is_done else 'Β·'} Step {idx}"
271
  if st.sidebar.button(label, key=f"nav_{idx}"):
272
  st.session_state.app_state['current_step'] = idx
273
- st.rerun()
274
 
275
  def render_tools_list():
276
  if st.session_state.app_state['tools_list']:
277
  with st.expander("πŸ”§ Required Tools & Materials", expanded=True):
278
- # Add narration button for tools list
279
- col1, col2 = st.columns([4, 1])
280
- with col1:
281
- for item in st.session_state.app_state['tools_list']:
282
- st.markdown(f"- {item}")
283
- with col2:
284
- if st.button("πŸ”Š Narrate Tools", key="narrate_tools"):
285
- tools_text = "Here are the required tools and materials: " + ", ".join(st.session_state.app_state['tools_list'])
286
- if 'tools_audio' not in st.session_state.app_state['audio_cache']:
287
- with st.spinner("Generating narration..."):
288
- st.session_state.app_state['audio_cache']['tools_audio'] = generate_speech(tools_text)
289
-
290
- if st.session_state.app_state['audio_cache']['tools_audio']:
291
- create_audio_player(st.session_state.app_state['audio_cache']['tools_audio'], "tools_player")
292
 
293
  def render_step(idx, text):
294
  total = len(st.session_state.app_state['steps'])
295
  st.markdown(f"### Step {idx} of {total}")
296
-
297
- # Add narration button for each step
298
- col1, col2 = st.columns([4, 1])
299
- with col1:
300
- st.write(text)
301
- with col2:
302
- if st.button("πŸ”Š Narrate", key=f"narrate_step_{idx}"):
303
- audio_key = f'step_{idx}_audio'
304
- if audio_key not in st.session_state.app_state['audio_cache']:
305
- with st.spinner("Generating narration..."):
306
- step_text = f"Step {idx}: {text}"
307
- st.session_state.app_state['audio_cache'][audio_key] = generate_speech(step_text)
308
-
309
- if st.session_state.app_state['audio_cache'][audio_key]:
310
- create_audio_player(st.session_state.app_state['audio_cache'][audio_key], f"step_{idx}_player")
311
 
 
312
  if idx in st.session_state.app_state['images']:
313
  st.image(
314
  st.session_state.app_state['images'][idx],
@@ -316,6 +262,35 @@ def render_step(idx, text):
316
  use_container_width=True
317
  )
318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  done = st.checkbox("βœ… Mark this step as completed", value=st.session_state.app_state['done_flags'].get(idx, False), key=f"done_{idx}")
320
  st.session_state.app_state['done_flags'][idx] = done
321
  notes = st.text_area("πŸ“ Your notes for this step:", value=st.session_state.app_state['notes'].get(idx, ""), height=100, key=f"notes_{idx}")
@@ -324,10 +299,10 @@ def render_step(idx, text):
324
  col1, col2, col3 = st.columns([1, 2, 1])
325
  if idx > 1 and col1.button("⬅️ Previous", key=f"prev_{idx}"):
326
  st.session_state.app_state['current_step'] -= 1
327
- st.rerun()
328
  if idx < total and col3.button("Next ➑️", key=f"next_{idx}"):
329
  st.session_state.app_state['current_step'] += 1
330
- st.rerun()
331
 
332
  # ─────────────────────────────────────────────────────────────────────────────
333
  # 4. APP LAYOUT
@@ -338,29 +313,29 @@ st.title("πŸ› οΈ NeoFix AI-Powered DIY Assistant")
338
 
339
  with st.expander("ℹ️ How it works", expanded=False):
340
  st.write("""
341
- 1. **Upload a photo** of your project or the item you want to fix or build (appliance, car part, plant, craft project).
342
- 2. **(Optional) Describe your goal** for more accurate results.
343
- 3. **Review the Plan.** The AI will propose a plan. If you didn't provide a description, you'll be asked to approve it.
344
- 4. **Get Your Guide** with tools and illustrated step-by-step instructions.
345
- 5. **Follow the Steps** using the interactive checklist with audio narration.
346
- """)
347
 
348
  if not st.session_state.app_state['prompt_sent']:
349
  st.markdown("---")
350
  col1, col2 = st.columns([3, 1])
351
  with col1:
352
  uploaded_image = st.file_uploader("πŸ“· Upload a photo of your project", type=["jpg", "jpeg", "png"])
353
- context_text = st.text_area("✏️ Describe the issue or your goal (optional but recommended)", height=80, placeholder="e.g., 'My toaster won't turn on,' or 'How do I build a desk like this?'")
354
  with col2:
355
  st.markdown("### Actions")
356
  if st.button("πŸš€ Get AI Guidance", type="primary", use_container_width=True):
357
  if uploaded_image:
358
  initial_analysis(uploaded_image, context_text)
359
- st.rerun()
360
  else:
361
  st.warning("⚠️ Please upload an image first!")
362
- if st.button("πŸ”„ Start Over", use_container_width=True):
363
- reset_state()
364
  else:
365
  render_sidebar_navigation()
366
  st.markdown("---")
@@ -375,14 +350,14 @@ else:
375
  for i, option in enumerate(st.session_state.app_state['upcycling_options']):
376
  if st.button(option, key=f"option_{i}"):
377
  generate_detailed_guide_with_images(selected_option=option)
378
- st.rerun()
379
  elif not st.session_state.app_state['plan_approved']:
380
  st.markdown("#### The AI has proposed the following plan:")
381
  st.success(st.session_state.app_state['initial_plan'])
382
  if st.button("βœ… Looks good, proceed with this plan", type="primary"):
383
  st.session_state.app_state['plan_approved'] = True
384
  generate_detailed_guide_with_images()
385
- st.rerun()
386
  else:
387
  render_tools_list()
388
  st.markdown("---")
@@ -392,7 +367,7 @@ else:
392
  render_step(step_num, step_text)
393
  except IndexError:
394
  st.session_state.app_state['current_step'] = 1
395
- st.rerun()
396
 
397
  total_steps = len(st.session_state.app_state['steps'])
398
  done_count = sum(1 for d in st.session_state.app_state['done_flags'].values() if d)
 
6
  import re
7
  import time
8
  import os
9
+ import io
10
  import wave
 
11
 
12
  # Disable Streamlit analytics (prevents PermissionError in some environments)
13
  os.environ["STREAMLIT_ANALYTICS_ENABLED"] = "false"
 
36
  # 1.3 Constants (model IDs, exactly as in original code)
37
  CATEGORY_MODEL = "gemini-2.0-flash-exp"
38
  GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
39
+ TTS_MODEL = "gemini-2.5-flash-preview-tts"
40
+ VOICE_NAME = "Kore"
41
 
42
  # 1.4 Helper to parse numbered steps out of Gemini text
43
  def parse_numbered_steps(text):
 
50
  steps = re.findall(r"\n\s*(\d+)\.\s*(.*)", text, re.MULTILINE)
51
  return [(int(num), desc.strip()) for num, desc in steps]
52
 
53
+ # 1.5 Helper to convert raw PCM into WAV bytes (for in-memory playback)
54
+ def tts_wav_bytes(pcm, channels=1, rate=24000, sample_width=2):
55
+ buf = io.BytesIO()
56
+ with wave.open(buf, "wb") as wf:
57
  wf.setnchannels(channels)
58
  wf.setsampwidth(sample_width)
59
  wf.setframerate(rate)
60
  wf.writeframes(pcm)
61
+ return buf.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # ─────────────────────────────────────────────────────────────────────────────
64
  # 2. SESSION STATE SETUP
 
71
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
72
  "project_title": "", "project_description": "", "upcycling_options": [],
73
  "plan_approved": False, "initial_plan": "", "user_image": None,
74
+ "tts": {} # store TTS WAV bytes per step index
75
  }
76
 
77
  # ─────────────────────────────────────────────────────────────────────────────
 
86
  "prompt_sent": False, "timer_running": {}, "last_tick": {},
87
  "project_title": "", "project_description": "", "upcycling_options": [],
88
  "plan_approved": False, "initial_plan": "", "user_image": None,
89
+ "tts": {}
90
  }
91
  st.success("βœ… Reset complete!")
92
+ st.experimental_rerun()
93
 
94
  def send_text_request(model_name, prompt, image):
95
  """Helper to send requests that expect only a text response."""
 
117
  "Reply with ONLY the category name."
118
  )
119
  category = send_text_request(CATEGORY_MODEL, category_prompt, image)
120
+ if not category:
121
+ return
122
  st.session_state.app_state['category'] = category
123
 
124
  plan_prompt = f"""
 
137
  [Your plan or 3 options]
138
  """
139
  plan_response = send_text_request(GENERATION_MODEL, plan_prompt, image)
140
+ if not plan_response:
141
+ return
142
 
143
  try:
144
  st.session_state.app_state['project_title'] = re.search(r"TITLE:\s*(.*)", plan_response).group(1).strip()
 
165
  """Generates the detailed guide with steps and illustrations."""
166
  image = st.session_state.app_state.get('user_image')
167
  if not image:
168
+ st.error("Image not found. Please start over.")
169
+ return
170
 
171
  context = f"The user has approved the plan for '{st.session_state.app_state['project_title']}'."
172
  if selected_option:
 
222
  st.session_state.app_state['timers'][idx] = val * (60 if "minute" in unit else 1)
223
  else:
224
  st.session_state.app_state['timers'][idx] = 0
225
+ # Initialize empty TTS slot (will be generated on demand)
226
+ st.session_state.app_state['tts'][idx] = None
227
  except Exception as e:
228
  st.error(f"Failed to generate or parse the illustrated guide: {str(e)}")
229
 
230
  def render_sidebar_navigation():
231
  st.sidebar.markdown("## Steps Navigation")
232
  steps = st.session_state.app_state['steps']
233
+ if not steps:
234
+ return
235
  total_steps = len(steps)
236
  completed = sum(1 for done in st.session_state.app_state['done_flags'].values() if done)
237
  st.sidebar.progress(completed / total_steps if total_steps > 0 else 0)
 
241
  label = f"{'βœ“' if is_done else 'Β·'} Step {idx}"
242
  if st.sidebar.button(label, key=f"nav_{idx}"):
243
  st.session_state.app_state['current_step'] = idx
244
+ st.experimental_rerun()
245
 
246
  def render_tools_list():
247
  if st.session_state.app_state['tools_list']:
248
  with st.expander("πŸ”§ Required Tools & Materials", expanded=True):
249
+ for item in st.session_state.app_state['tools_list']:
250
+ st.markdown(f"- {item}")
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  def render_step(idx, text):
253
  total = len(st.session_state.app_state['steps'])
254
  st.markdown(f"### Step {idx} of {total}")
255
+ st.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ # Display illustrative image if available
258
  if idx in st.session_state.app_state['images']:
259
  st.image(
260
  st.session_state.app_state['images'][idx],
 
262
  use_container_width=True
263
  )
264
 
265
+ # TTS generation and playback
266
+ # If we haven't generated TTS for this step yet, do it now
267
+ if st.session_state.app_state['tts'].get(idx) is None:
268
+ try:
269
+ tts_response = client.models.generate_content(
270
+ model=TTS_MODEL,
271
+ contents=text,
272
+ config=types.GenerateContentConfig(
273
+ response_modalities=["AUDIO"],
274
+ speech_config=types.SpeechConfig(
275
+ voice_config=types.VoiceConfig(
276
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(
277
+ voice_name=VOICE_NAME,
278
+ )
279
+ )
280
+ ),
281
+ )
282
+ )
283
+ pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data
284
+ wav_bytes = tts_wav_bytes(pcm_data)
285
+ st.session_state.app_state['tts'][idx] = wav_bytes
286
+ except Exception as e:
287
+ st.error(f"Failed to generate TTS for step {idx}: {e}")
288
+
289
+ # If WAV bytes are available, show a play button
290
+ if st.session_state.app_state['tts'].get(idx):
291
+ st.audio(st.session_state.app_state['tts'][idx], format="audio/wav")
292
+
293
+ # Checkbox and notes
294
  done = st.checkbox("βœ… Mark this step as completed", value=st.session_state.app_state['done_flags'].get(idx, False), key=f"done_{idx}")
295
  st.session_state.app_state['done_flags'][idx] = done
296
  notes = st.text_area("πŸ“ Your notes for this step:", value=st.session_state.app_state['notes'].get(idx, ""), height=100, key=f"notes_{idx}")
 
299
  col1, col2, col3 = st.columns([1, 2, 1])
300
  if idx > 1 and col1.button("⬅️ Previous", key=f"prev_{idx}"):
301
  st.session_state.app_state['current_step'] -= 1
302
+ st.experimental_rerun()
303
  if idx < total and col3.button("Next ➑️", key=f"next_{idx}"):
304
  st.session_state.app_state['current_step'] += 1
305
+ st.experimental_rerun()
306
 
307
  # ─────────────────────────────────────────────────────────────────────────────
308
  # 4. APP LAYOUT
 
313
 
314
  with st.expander("ℹ️ How it works", expanded=False):
315
  st.write("""
316
+ 1. **Upload a photo** of your project or the item you want to fix or build (appliance, car part, plant, craft project).
317
+ 2. **(Optional) Describe your goal** for more accurate results.
318
+ 3. **Review the Plan.** The AI will propose a plan. If you didn’t provide a description, you’ll be asked to approve it.
319
+ 4. **Get Your Guide** with tools and illustrated step-by-step instructions.
320
+ 5. **Follow the Steps** using the interactive checklist (with audio narration for each step).
321
+ """)
322
 
323
  if not st.session_state.app_state['prompt_sent']:
324
  st.markdown("---")
325
  col1, col2 = st.columns([3, 1])
326
  with col1:
327
  uploaded_image = st.file_uploader("πŸ“· Upload a photo of your project", type=["jpg", "jpeg", "png"])
328
+ context_text = st.text_area("✏️ Describe the issue or your goal (optional but recommended)", height=80, placeholder="e.g., β€˜My toaster won’t turn on,’ or β€˜How do I build a desk like this?’")
329
  with col2:
330
  st.markdown("### Actions")
331
  if st.button("πŸš€ Get AI Guidance", type="primary", use_container_width=True):
332
  if uploaded_image:
333
  initial_analysis(uploaded_image, context_text)
334
+ st.experimental_rerun()
335
  else:
336
  st.warning("⚠️ Please upload an image first!")
337
+ if st.button("πŸ”„ Start Over", use_container_width=True):
338
+ reset_state()
339
  else:
340
  render_sidebar_navigation()
341
  st.markdown("---")
 
350
  for i, option in enumerate(st.session_state.app_state['upcycling_options']):
351
  if st.button(option, key=f"option_{i}"):
352
  generate_detailed_guide_with_images(selected_option=option)
353
+ st.experimental_rerun()
354
  elif not st.session_state.app_state['plan_approved']:
355
  st.markdown("#### The AI has proposed the following plan:")
356
  st.success(st.session_state.app_state['initial_plan'])
357
  if st.button("βœ… Looks good, proceed with this plan", type="primary"):
358
  st.session_state.app_state['plan_approved'] = True
359
  generate_detailed_guide_with_images()
360
+ st.experimental_rerun()
361
  else:
362
  render_tools_list()
363
  st.markdown("---")
 
367
  render_step(step_num, step_text)
368
  except IndexError:
369
  st.session_state.app_state['current_step'] = 1
370
+ st.experimental_rerun()
371
 
372
  total_steps = len(st.session_state.app_state['steps'])
373
  done_count = sum(1 for d in st.session_state.app_state['done_flags'].values() if d)