feat: Implement audio analysis tools, enhance agent reasoning with a multi-step ReAct loop, and add local submission backup.
Browse files- agent.py +76 -30
- app.py +9 -0
- requirements.txt +1 -0
agent.py
CHANGED
|
@@ -16,11 +16,21 @@ from langchain_groq import ChatGroq
|
|
| 16 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 17 |
from langchain_community.document_loaders import WebBaseLoader
|
| 18 |
import base64
|
|
|
|
| 19 |
try:
|
| 20 |
import cv2
|
| 21 |
except ImportError:
|
| 22 |
cv2 = None
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
load_dotenv()
|
| 25 |
|
| 26 |
# Base Hugging Face LLM used by the chat wrapper
|
|
@@ -124,6 +134,23 @@ def analyze_image(image_path: str, question: str) -> str:
|
|
| 124 |
except Exception as e:
|
| 125 |
return f"Error analyzing image: {str(e)}"
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
@tool
|
| 128 |
def analyze_video(video_path: str, question: str) -> str:
|
| 129 |
"""
|
|
@@ -172,7 +199,17 @@ def analyze_video(video_path: str, question: str) -> str:
|
|
| 172 |
# 2. Compile the context for the agent
|
| 173 |
video_context = "\n".join(extracted_descriptions)
|
| 174 |
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
except Exception as e:
|
| 177 |
return f"Error analyzing video: {str(e)}"
|
| 178 |
|
|
@@ -281,7 +318,7 @@ def restart_required(state: AgentState) -> AgentState:
|
|
| 281 |
# return {"messages": messages + [response]}
|
| 282 |
|
| 283 |
# Augment the LLM with tools
|
| 284 |
-
tools = [web_search, wiki_search, analyze_image, analyze_video, read_url, run_python_script, read_document]
|
| 285 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 286 |
model_with_tools = model.bind_tools(tools)
|
| 287 |
|
|
@@ -297,7 +334,7 @@ def answer_message(state: AgentState) -> AgentState:
|
|
| 297 |
TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
|
| 298 |
|
| 299 |
CRITICAL RULES FOR SEARCH & TOOLS:
|
| 300 |
-
1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_video) to answer the question based on the file content.
|
| 301 |
2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
|
| 302 |
3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
|
| 303 |
4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
|
|
@@ -316,33 +353,42 @@ def answer_message(state: AgentState) -> AgentState:
|
|
| 316 |
""")]
|
| 317 |
messages = prompt + messages
|
| 318 |
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
"
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
# Third pass: strict GAIA formatting extraction
|
| 348 |
formatting_sys = SystemMessage(
|
|
|
|
| 16 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
| 17 |
from langchain_community.document_loaders import WebBaseLoader
|
| 18 |
import base64
|
| 19 |
+
|
| 20 |
try:
|
| 21 |
import cv2
|
| 22 |
except ImportError:
|
| 23 |
cv2 = None
|
| 24 |
|
| 25 |
+
whisper_model = None
|
| 26 |
+
def get_whisper():
|
| 27 |
+
global whisper_model
|
| 28 |
+
if whisper_model is None:
|
| 29 |
+
import whisper
|
| 30 |
+
# Lazy load the smallest, fastest model
|
| 31 |
+
whisper_model = whisper.load_model("base")
|
| 32 |
+
return whisper_model
|
| 33 |
+
|
| 34 |
load_dotenv()
|
| 35 |
|
| 36 |
# Base Hugging Face LLM used by the chat wrapper
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
return f"Error analyzing image: {str(e)}"
|
| 136 |
|
| 137 |
+
@tool
|
| 138 |
+
def analyze_audio(audio_path: str, question: str) -> str:
|
| 139 |
+
"""
|
| 140 |
+
Transcribes an audio file (.mp3, .wav, .m4a) to answer questions about what is spoken.
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
audio_path: The local path to the audio file.
|
| 144 |
+
question: The specific question to ask.
|
| 145 |
+
"""
|
| 146 |
+
try:
|
| 147 |
+
model = get_whisper()
|
| 148 |
+
result = model.transcribe(audio_path)
|
| 149 |
+
transcript = result["text"]
|
| 150 |
+
return f"Audio Transcript:\n{transcript}"
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return f"Error analyzing audio: {str(e)}. Tip: This requires 'ffmpeg' to be installed on your system."
|
| 153 |
+
|
| 154 |
@tool
|
| 155 |
def analyze_video(video_path: str, question: str) -> str:
|
| 156 |
"""
|
|
|
|
| 199 |
# 2. Compile the context for the agent
|
| 200 |
video_context = "\n".join(extracted_descriptions)
|
| 201 |
|
| 202 |
+
# 3. Transcribe audio if possible
|
| 203 |
+
try:
|
| 204 |
+
whisper_mod = get_whisper()
|
| 205 |
+
trans_result = whisper_mod.transcribe(video_path)
|
| 206 |
+
transcript = trans_result.get("text", "")
|
| 207 |
+
if transcript.strip():
|
| 208 |
+
video_context += f"\n\nVideo Audio Transcript:\n{transcript}"
|
| 209 |
+
except Exception as e:
|
| 210 |
+
video_context += f"\n\n(No audio transcript generated: {e})"
|
| 211 |
+
|
| 212 |
+
return f"Video Summary based on extracted frames and audio:\n{video_context}"
|
| 213 |
except Exception as e:
|
| 214 |
return f"Error analyzing video: {str(e)}"
|
| 215 |
|
|
|
|
| 318 |
# return {"messages": messages + [response]}
|
| 319 |
|
| 320 |
# Augment the LLM with tools
|
| 321 |
+
tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
|
| 322 |
tools_by_name = {tool.name: tool for tool in tools}
|
| 323 |
model_with_tools = model.bind_tools(tools)
|
| 324 |
|
|
|
|
| 334 |
TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
|
| 335 |
|
| 336 |
CRITICAL RULES FOR SEARCH & TOOLS:
|
| 337 |
+
1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_audio, analyze_video) to answer the question based on the file content.
|
| 338 |
2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
|
| 339 |
3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
|
| 340 |
4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
|
|
|
|
| 353 |
""")]
|
| 354 |
messages = prompt + messages
|
| 355 |
|
| 356 |
+
# Multi-step ReAct Loop (Up to 8 reasoning steps)
|
| 357 |
+
max_steps = 8
|
| 358 |
+
draft_response = None
|
| 359 |
+
|
| 360 |
+
for step in range(max_steps):
|
| 361 |
+
print(f"--- ReAct Step {step + 1} ---")
|
| 362 |
+
ai_msg = model_with_tools.invoke(messages)
|
| 363 |
+
messages.append(ai_msg)
|
| 364 |
+
|
| 365 |
+
# Check if the model requested tools
|
| 366 |
+
tool_calls = getattr(ai_msg, "tool_calls", None) or []
|
| 367 |
+
if not tool_calls:
|
| 368 |
+
# Model decided it has enough info to answer
|
| 369 |
+
draft_response = ai_msg
|
| 370 |
+
print(f"Model found answer or stopped tools: {ai_msg.content}")
|
| 371 |
+
break
|
| 372 |
+
|
| 373 |
+
# Execute requested tools and append their text output into the conversation
|
| 374 |
+
for tool_call in tool_calls:
|
| 375 |
+
name = tool_call["name"]
|
| 376 |
+
args = tool_call["args"]
|
| 377 |
+
print(f"Calling tool: {name} with args: {args}")
|
| 378 |
+
try:
|
| 379 |
+
tool = tools_by_name[name]
|
| 380 |
+
tool_result = tool.invoke(args)
|
| 381 |
+
except Exception as e:
|
| 382 |
+
tool_result = f"Error executing tool {name}: {str(e)}"
|
| 383 |
+
|
| 384 |
+
messages.append(HumanMessage(content=f"Tool result ({name}):\n{tool_result}"))
|
| 385 |
+
|
| 386 |
+
# If we exhausted all steps without an answer, force a draft response
|
| 387 |
+
if draft_response is None:
|
| 388 |
+
print("Max reasoning steps reached. Forcing answer extraction.")
|
| 389 |
+
forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
|
| 390 |
+
messages.append(forced_msg)
|
| 391 |
+
draft_response = model.invoke(messages)
|
| 392 |
|
| 393 |
# Third pass: strict GAIA formatting extraction
|
| 394 |
formatting_sys = SystemMessage(
|
app.py
CHANGED
|
@@ -159,6 +159,15 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 159 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 160 |
print(status_update)
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
# 5. Submit
|
| 163 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 164 |
try:
|
|
|
|
| 159 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 160 |
print(status_update)
|
| 161 |
|
| 162 |
+
# Back up answers locally in case the HF submission server returns a 500 error
|
| 163 |
+
import json
|
| 164 |
+
try:
|
| 165 |
+
with open("backup_submission.json", "w") as f:
|
| 166 |
+
json.dump(submission_data, f)
|
| 167 |
+
print("Answers backed up to backup_submission.json successfully.")
|
| 168 |
+
except Exception as e:
|
| 169 |
+
print(f"Could not backup answers: {e}")
|
| 170 |
+
|
| 171 |
# 5. Submit
|
| 172 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 173 |
try:
|
requirements.txt
CHANGED
|
@@ -24,3 +24,4 @@ unstructured[all-docs]
|
|
| 24 |
opencv-python
|
| 25 |
beautifulsoup4
|
| 26 |
PyPDF2
|
|
|
|
|
|
| 24 |
opencv-python
|
| 25 |
beautifulsoup4
|
| 26 |
PyPDF2
|
| 27 |
+
openai-whisper
|