Paperbag committed on
Commit
d20527e
·
1 Parent(s): 6000e5d

feat: Implement audio analysis tools, enhance agent reasoning with a multi-step ReAct loop, and add local submission backup.

Browse files
Files changed (3) hide show
  1. agent.py +76 -30
  2. app.py +9 -0
  3. requirements.txt +1 -0
agent.py CHANGED
@@ -16,11 +16,21 @@ from langchain_groq import ChatGroq
16
  from langchain_community.document_loaders.image import UnstructuredImageLoader
17
  from langchain_community.document_loaders import WebBaseLoader
18
  import base64
 
19
  try:
20
  import cv2
21
  except ImportError:
22
  cv2 = None
23
 
 
 
 
 
 
 
 
 
 
24
  load_dotenv()
25
 
26
  # Base Hugging Face LLM used by the chat wrapper
@@ -124,6 +134,23 @@ def analyze_image(image_path: str, question: str) -> str:
124
  except Exception as e:
125
  return f"Error analyzing image: {str(e)}"
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  @tool
128
  def analyze_video(video_path: str, question: str) -> str:
129
  """
@@ -172,7 +199,17 @@ def analyze_video(video_path: str, question: str) -> str:
172
  # 2. Compile the context for the agent
173
  video_context = "\n".join(extracted_descriptions)
174
 
175
- return f"Video Summary based on extracted frames:\n{video_context}"
 
 
 
 
 
 
 
 
 
 
176
  except Exception as e:
177
  return f"Error analyzing video: {str(e)}"
178
 
@@ -281,7 +318,7 @@ def restart_required(state: AgentState) -> AgentState:
281
  # return {"messages": messages + [response]}
282
 
283
  # Augment the LLM with tools
284
- tools = [web_search, wiki_search, analyze_image, analyze_video, read_url, run_python_script, read_document]
285
  tools_by_name = {tool.name: tool for tool in tools}
286
  model_with_tools = model.bind_tools(tools)
287
 
@@ -297,7 +334,7 @@ def answer_message(state: AgentState) -> AgentState:
297
  TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
298
 
299
  CRITICAL RULES FOR SEARCH & TOOLS:
300
- 1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_video) to answer the question based on the file content.
301
  2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
302
  3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
303
  4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
@@ -316,33 +353,42 @@ def answer_message(state: AgentState) -> AgentState:
316
  """)]
317
  messages = prompt + messages
318
 
319
- # First pass: let model decide whether to call web_search
320
- ai_msg = model_with_tools.invoke(messages)
321
- messages.append(ai_msg)
322
-
323
- # If the model didn't request any tools, its content is already the answer
324
- tool_calls = getattr(ai_msg, "tool_calls", None) or []
325
- if not tool_calls:
326
- print(f"Final response: {ai_msg}")
327
- return {"messages": messages}
328
-
329
- # Execute requested tools and append their text output into the conversation
330
- for tool_call in tool_calls:
331
- name = tool_call["name"]
332
- args = tool_call["args"]
333
- tool = tools_by_name[name]
334
- tool_result = tool.invoke(args) # this is a plain string from web_search
335
- messages.append(HumanMessage(content=f"Tool result ({name}):\n{tool_result}"))
336
-
337
- # Second pass: force a plain-text final answer (no tool calls expected)
338
- final_instruction = HumanMessage(
339
- content=(
340
- "Using the tool results above, provide the FINAL numeric/text answer now. "
341
- "Do not call any tools. Provide exactly what was asked."
342
- )
343
- )
344
- messages.append(final_instruction)
345
- draft_response = model.invoke(messages)
 
 
 
 
 
 
 
 
 
346
 
347
  # Third pass: strict GAIA formatting extraction
348
  formatting_sys = SystemMessage(
 
16
  from langchain_community.document_loaders.image import UnstructuredImageLoader
17
  from langchain_community.document_loaders import WebBaseLoader
18
  import base64
19
+
20
  try:
21
  import cv2
22
  except ImportError:
23
  cv2 = None
24
 
25
+ whisper_model = None
26
+ def get_whisper():
27
+ global whisper_model
28
+ if whisper_model is None:
29
+ import whisper
30
+ # Lazy load the smallest, fastest model
31
+ whisper_model = whisper.load_model("base")
32
+ return whisper_model
33
+
34
  load_dotenv()
35
 
36
  # Base Hugging Face LLM used by the chat wrapper
 
134
  except Exception as e:
135
  return f"Error analyzing image: {str(e)}"
136
 
137
+ @tool
138
+ def analyze_audio(audio_path: str, question: str) -> str:
139
+ """
140
+ Transcribes an audio file (.mp3, .wav, .m4a) to answer questions about what is spoken.
141
+
142
+ Args:
143
+ audio_path: The local path to the audio file.
144
+ question: The specific question to ask.
145
+ """
146
+ try:
147
+ model = get_whisper()
148
+ result = model.transcribe(audio_path)
149
+ transcript = result["text"]
150
+ return f"Audio Transcript:\n{transcript}"
151
+ except Exception as e:
152
+ return f"Error analyzing audio: {str(e)}. Tip: this requires 'ffmpeg' to be installed on your system."
153
+
154
  @tool
155
  def analyze_video(video_path: str, question: str) -> str:
156
  """
 
199
  # 2. Compile the context for the agent
200
  video_context = "\n".join(extracted_descriptions)
201
 
202
+ # 3. Transcribe audio if possible
203
+ try:
204
+ whisper_mod = get_whisper()
205
+ trans_result = whisper_mod.transcribe(video_path)
206
+ transcript = trans_result.get("text", "")
207
+ if transcript.strip():
208
+ video_context += f"\n\nVideo Audio Transcript:\n{transcript}"
209
+ except Exception as e:
210
+ video_context += f"\n\n(No audio transcript generated: {e})"
211
+
212
+ return f"Video Summary based on extracted frames and audio:\n{video_context}"
213
  except Exception as e:
214
  return f"Error analyzing video: {str(e)}"
215
 
 
318
  # return {"messages": messages + [response]}
319
 
320
  # Augment the LLM with tools
321
+ tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
322
  tools_by_name = {tool.name: tool for tool in tools}
323
  model_with_tools = model.bind_tools(tools)
324
 
 
334
  TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
335
 
336
  CRITICAL RULES FOR SEARCH & TOOLS:
337
+ 1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_audio, analyze_video) to answer the question based on the file content.
338
  2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
339
  3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
340
  4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
 
353
  """)]
354
  messages = prompt + messages
355
 
356
+ # Multi-step ReAct Loop (Up to 8 reasoning steps)
357
+ max_steps = 8
358
+ draft_response = None
359
+
360
+ for step in range(max_steps):
361
+ print(f"--- ReAct Step {step + 1} ---")
362
+ ai_msg = model_with_tools.invoke(messages)
363
+ messages.append(ai_msg)
364
+
365
+ # Check if the model requested tools
366
+ tool_calls = getattr(ai_msg, "tool_calls", None) or []
367
+ if not tool_calls:
368
+ # Model decided it has enough info to answer
369
+ draft_response = ai_msg
370
+ print(f"Model found answer or stopped tools: {ai_msg.content}")
371
+ break
372
+
373
+ # Execute requested tools and append their text output into the conversation
374
+ for tool_call in tool_calls:
375
+ name = tool_call["name"]
376
+ args = tool_call["args"]
377
+ print(f"Calling tool: {name} with args: {args}")
378
+ try:
379
+ tool = tools_by_name[name]
380
+ tool_result = tool.invoke(args)
381
+ except Exception as e:
382
+ tool_result = f"Error executing tool {name}: {str(e)}"
383
+
384
+ messages.append(HumanMessage(content=f"Tool result ({name}):\n{tool_result}"))
385
+
386
+ # If we exhausted all steps without an answer, force a draft response
387
+ if draft_response is None:
388
+ print("Max reasoning steps reached. Forcing answer extraction.")
389
+ forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
390
+ messages.append(forced_msg)
391
+ draft_response = model.invoke(messages)
392
 
393
  # Third pass: strict GAIA formatting extraction
394
  formatting_sys = SystemMessage(
app.py CHANGED
@@ -159,6 +159,15 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
159
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
160
  print(status_update)
161
 
 
 
 
 
 
 
 
 
 
162
  # 5. Submit
163
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
164
  try:
 
159
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
160
  print(status_update)
161
 
162
+ # Backup locally just in case the HF submission server 500 crashes
163
+ import json
164
+ try:
165
+ with open("backup_submission.json", "w") as f:
166
+ json.dump(submission_data, f)
167
+ print("Answers backed up to backup_submission.json successfully.")
168
+ except Exception as e:
169
+ print(f"Could not backup answers: {e}")
170
+
171
  # 5. Submit
172
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
173
  try:
requirements.txt CHANGED
@@ -24,3 +24,4 @@ unstructured[all-docs]
24
  opencv-python
25
  beautifulsoup4
26
  PyPDF2
 
 
24
  opencv-python
25
  beautifulsoup4
26
  PyPDF2
27
+ openai-whisper