Update project_model.py
Browse files - project_model.py +16 -15
project_model.py
CHANGED
|
@@ -193,42 +193,43 @@ def process_inputs(
|
|
| 193 |
|
| 194 |
# Reset session with the current image and visual context
|
| 195 |
session.reset(image, annotated_image, visual_context)
|
| 196 |
-
|
| 197 |
|
| 198 |
if audio_path:
|
| 199 |
audio_text = whisper_pipe(audio_path)["text"]
|
| 200 |
-
question += " " + audio_text
|
| 201 |
|
|
|
|
| 202 |
session.add_question(question)
|
| 203 |
|
| 204 |
-
# Send prompt to Gemma 3
|
| 205 |
-
gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=500)
|
| 206 |
|
| 207 |
-
#
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
#
|
| 212 |
if isinstance(gemma_output, list) and len(gemma_output) > 0:
|
| 213 |
gemma_text = gemma_output[0]["generated_text"][-1]["content"]
|
| 214 |
if isinstance(gemma_text, str):
|
| 215 |
answer = gemma_text
|
| 216 |
else:
|
| 217 |
-
# Handle unexpected formats or empty text
|
| 218 |
answer = "No valid generated text found."
|
| 219 |
else:
|
| 220 |
-
answer = "No valid output from Gemma model"
|
| 221 |
-
|
| 222 |
-
# answer = gemma_output[0]["generated_text"][-1]["content"]
|
| 223 |
|
| 224 |
-
#
|
| 225 |
session.add_answer(answer)
|
| 226 |
|
| 227 |
-
#
|
| 228 |
output_audio_path = "response.wav"
|
| 229 |
if enable_tts:
|
| 230 |
tts.tts_to_file(text=answer, file_path=output_audio_path)
|
| 231 |
else:
|
| 232 |
output_audio_path = None
|
| 233 |
|
| 234 |
-
return answer, output_audio_path
|
|
|
|
| 193 |
|
| 194 |
# Reset session with the current image and visual context
|
| 195 |
session.reset(image, annotated_image, visual_context)
|
|
|
|
| 196 |
|
| 197 |
if audio_path:
|
| 198 |
audio_text = whisper_pipe(audio_path)["text"]
|
| 199 |
+
question += " " + audio_text.strip()
|
| 200 |
|
| 201 |
+
# Add user's new question to the history
|
| 202 |
session.add_question(question)
|
| 203 |
|
|
|
|
|
|
|
| 204 |
|
| 205 |
+
# Sends current image and current question to Gemma 3
|
| 206 |
+
gemma_output = gemma_pipe(
|
| 207 |
+
image=session.current_image,
|
| 208 |
+
question=question,
|
| 209 |
+
max_new_tokens=500
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# Debugging (optional)
|
| 213 |
+
#print("Gemma Output:", gemma_output)
|
| 214 |
|
| 215 |
+
# Handle output format safely
|
| 216 |
if isinstance(gemma_output, list) and len(gemma_output) > 0:
|
| 217 |
gemma_text = gemma_output[0]["generated_text"][-1]["content"]
|
| 218 |
if isinstance(gemma_text, str):
|
| 219 |
answer = gemma_text
|
| 220 |
else:
|
|
|
|
| 221 |
answer = "No valid generated text found."
|
| 222 |
else:
|
| 223 |
+
answer = "No valid output from Gemma model."
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
# Save assistant's answer into session history
|
| 226 |
session.add_answer(answer)
|
| 227 |
|
| 228 |
+
# Text-to-speech output
|
| 229 |
output_audio_path = "response.wav"
|
| 230 |
if enable_tts:
|
| 231 |
tts.tts_to_file(text=answer, file_path=output_audio_path)
|
| 232 |
else:
|
| 233 |
output_audio_path = None
|
| 234 |
|
| 235 |
+
return answer, output_audio_path
|