Spaces:

saa231
/

MutimodalVisionAssistant

Paused

saa231 committed on Apr 22, 2025

Commit

fc25a0b

verified ·

1 Parent(s): 3103aa5

Update project_model.py

Files changed (1) hide show

project_model.py CHANGED Viewed

@@ -77,11 +77,22 @@ class VisualQAState:
     def add_question(self, question: str):
         """
-        Adds a follow-up text message to the chat.
         """
         self.message_history.append({
-            "role": "user",
-            "content": [{"type": "text", "text": question}]
         })
 # -------------------------------
@@ -171,13 +182,16 @@ def process_inputs(
         audio_text = whisper_pipe(audio_path)["text"]
         question += " " + audio_text
-    # Append question to conversation history
     session.add_question(question)
     # Generate response using GEMMA with full conversation history
     gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=200)
     answer = gemma_output[0]["generated_text"][-1]["content"]
     # If TTS is enabled, synthesize answer as speech
     output_audio_path = "response.wav"
     if enable_tts:
@@ -185,4 +199,4 @@ def process_inputs(
     else:
         output_audio_path = None
-    return answer, output_audio_path

     def add_question(self, question: str):
         """
+        Adds a follow-up question only if the last message was from assistant.
+        Ensures alternating user/assistant messages.
+        """
+        if not self.message_history or self.message_history[-1]["role"] == "assistant":
+            self.message_history.append({
+                "role": "user",
+                "content": [{"type": "text", "text": question}]
+            })
+    def add_answer(self, answer: str):
+        """
+        Appends the assistant's response to the conversation history.
         """
         self.message_history.append({
+            "role": "assistant",
+            "content": [{"type": "text", "text": answer}]
         })
 # -------------------------------
         audio_text = whisper_pipe(audio_path)["text"]
         question += " " + audio_text
+    # Append question to conversation history (only if alternating correctly)
     session.add_question(question)
     # Generate response using GEMMA with full conversation history
     gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=200)
     answer = gemma_output[0]["generated_text"][-1]["content"]
+    # Append GEMMA's response to the history to maintain alternating structure
+    session.add_answer(answer)
     # If TTS is enabled, synthesize answer as speech
     output_audio_path = "response.wav"
     if enable_tts:
     else:
         output_audio_path = None
+    return answer, output_audio_path