Spaces:

saa231
/

MutimodalVisionAssistant

Paused

saa231 committed on Apr 27, 2025

Commit

727b8c6

verified ·

1 Parent(s): 8435f69

Update project_model.py

Files changed (1) hide show

project_model.py CHANGED Viewed

@@ -196,7 +196,7 @@ def process_inputs(
     if audio_path:
         audio_text = whisper_pipe(audio_path)["text"]
-        question += " " + audio_text.strip()
     # Add user's new question to the history
     session.add_question(question)
@@ -204,7 +204,7 @@ def process_inputs(
     # Sends current image and current question to Gemma 3
     gemma_output = gemma_pipe(
-        image=session.current_image,
         question=question,
         max_new_tokens=500
     )

     if audio_path:
         audio_text = whisper_pipe(audio_path)["text"]
+        question += 'You are a helpful visual assistant designed for visually impaired users that assists users by answering the following question. If unsure, say "I am not certain."' + audio_text.strip()
     # Add user's new question to the history
     session.add_question(question)
     # Sends current image and current question to Gemma 3
     gemma_output = gemma_pipe(
+        image=session.annotated_image,
         question=question,
         max_new_tokens=500
     )