saa231 committed on
Commit
8435f69
·
verified ·
1 Parent(s): 903dad9

Update project_model.py

Browse files
Files changed (1) hide show
  1. project_model.py +16 -15
project_model.py CHANGED
@@ -193,42 +193,43 @@ def process_inputs(
193
 
194
  # Reset session with the current image and visual context
195
  session.reset(image, annotated_image, visual_context)
196
-
197
 
198
  if audio_path:
199
  audio_text = whisper_pipe(audio_path)["text"]
200
- question += " " + audio_text
201
 
 
202
  session.add_question(question)
203
 
204
- # Send prompt to Gemma 3
205
- gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=500)
206
 
207
- # Debugging: Check the output structure
208
- print("Gemma Output:", gemma_output)
209
- print("Type of Gemma Output:", type(gemma_output))
 
 
 
 
 
 
210
 
211
- # Make sure the output is in the expected format (a string)
212
  if isinstance(gemma_output, list) and len(gemma_output) > 0:
213
  gemma_text = gemma_output[0]["generated_text"][-1]["content"]
214
  if isinstance(gemma_text, str):
215
  answer = gemma_text
216
  else:
217
- # Handle unexpected formats or empty text
218
  answer = "No valid generated text found."
219
  else:
220
- answer = "No valid output from Gemma model"
221
-
222
- # answer = gemma_output[0]["generated_text"][-1]["content"]
223
 
224
- # Append GEMMA's response to the history to maintain alternating structure
225
  session.add_answer(answer)
226
 
227
- # If TTS is enabled, synthesize answer as speech
228
  output_audio_path = "response.wav"
229
  if enable_tts:
230
  tts.tts_to_file(text=answer, file_path=output_audio_path)
231
  else:
232
  output_audio_path = None
233
 
234
- return answer, output_audio_path
 
193
 
194
  # Reset session with the current image and visual context
195
  session.reset(image, annotated_image, visual_context)
 
196
 
197
  if audio_path:
198
  audio_text = whisper_pipe(audio_path)["text"]
199
+ question += " " + audio_text.strip()
200
 
201
+ # Add user's new question to the history
202
  session.add_question(question)
203
 
 
 
204
 
205
+ # Sends current image and current question to Gemma 3
206
+ gemma_output = gemma_pipe(
207
+ image=session.current_image,
208
+ question=question,
209
+ max_new_tokens=500
210
+ )
211
+
212
+ # Debugging (optional)
213
+ #print("Gemma Output:", gemma_output)
214
 
215
+ # Handle output format safely
216
  if isinstance(gemma_output, list) and len(gemma_output) > 0:
217
  gemma_text = gemma_output[0]["generated_text"][-1]["content"]
218
  if isinstance(gemma_text, str):
219
  answer = gemma_text
220
  else:
 
221
  answer = "No valid generated text found."
222
  else:
223
+ answer = "No valid output from Gemma model."
 
 
224
 
225
+ # Save assistant's answer into session history
226
  session.add_answer(answer)
227
 
228
+ # Text-to-speech output
229
  output_audio_path = "response.wav"
230
  if enable_tts:
231
  tts.tts_to_file(text=answer, file_path=output_audio_path)
232
  else:
233
  output_audio_path = None
234
 
235
+ return answer, output_audio_path