Spaces:

saa231
/

MutimodalVisionAssistant

Paused

saa231 committed on Apr 27, 2025

Commit

8435f69

verified ·

1 Parent(s): 903dad9

Update project_model.py

Files changed (1) hide show

project_model.py CHANGED Viewed

@@ -193,42 +193,43 @@ def process_inputs(
         # Reset session with the current image and visual context
         session.reset(image, annotated_image, visual_context)
     if audio_path:
         audio_text = whisper_pipe(audio_path)["text"]
-        question += " " + audio_text
     session.add_question(question)
-    # Send prompt to Gemma 3
-    gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=500)
-    # Debugging: Check the output structure
-    print("Gemma Output:", gemma_output)
-    print("Type of Gemma Output:", type(gemma_output))
-    # Make sure the output is in the expected format (a string)
     if isinstance(gemma_output, list) and len(gemma_output) > 0:
         gemma_text = gemma_output[0]["generated_text"][-1]["content"]
         if isinstance(gemma_text, str):
             answer = gemma_text
         else:
-            # Handle unexpected formats or empty text
             answer = "No valid generated text found."
     else:
-        answer = "No valid output from Gemma model"
-    # answer = gemma_output[0]["generated_text"][-1]["content"]
-    # Append GEMMA's response to the history to maintain alternating structure
     session.add_answer(answer)
-    # If TTS is enabled, synthesize answer as speech
     output_audio_path = "response.wav"
     if enable_tts:
         tts.tts_to_file(text=answer, file_path=output_audio_path)
     else:
         output_audio_path = None
-    return answer, output_audio_path

         # Reset session with the current image and visual context
         session.reset(image, annotated_image, visual_context)
     if audio_path:
         audio_text = whisper_pipe(audio_path)["text"]
+        question += " " + audio_text.strip()
+    # Add user's new question to the history
     session.add_question(question)
+    # Sends current image and current question to Gemma 3
+    gemma_output = gemma_pipe(
+        image=session.current_image,
+        question=question,
+        max_new_tokens=500
+    )
+    # Debugging (optional)
+    #print("Gemma Output:", gemma_output)
+    # Handle output format safely
     if isinstance(gemma_output, list) and len(gemma_output) > 0:
         gemma_text = gemma_output[0]["generated_text"][-1]["content"]
         if isinstance(gemma_text, str):
             answer = gemma_text
         else:
             answer = "No valid generated text found."
     else:
+        answer = "No valid output from Gemma model."
+    # Save assistant's answer into session history
     session.add_answer(answer)
+    # Text-to-speech output
     output_audio_path = "response.wav"
     if enable_tts:
         tts.tts_to_file(text=answer, file_path=output_audio_path)
     else:
         output_audio_path = None
+    return answer, output_audio_path