Paperbag committed on
Commit
717c736
·
1 Parent(s): a5ab16b
Files changed (4) hide show
  1. __pycache__/agent.cpython-39.pyc +0 -0
  2. agent.py +37 -5
  3. app copy.py +25 -19
  4. requirements.txt +2 -1
__pycache__/agent.cpython-39.pyc CHANGED
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
 
agent.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  from typing import TypedDict, List, Dict, Any, Optional, Union
3
  from langchain_core import tools
4
  from langgraph.graph import StateGraph, START, END
@@ -11,6 +12,7 @@ from dotenv import load_dotenv
11
  from groq import Groq
12
  from langchain_groq import ChatGroq
13
  from langchain_community.document_loaders.image import UnstructuredImageLoader
 
14
  import base64
15
  try:
16
  import cv2
@@ -65,7 +67,7 @@ def web_search(keywords: str) -> str:
65
  @tool
66
  def wiki_search(query: str) -> str:
67
  """
68
- Search Wikipedia for a query and return a maximum of 2 results
69
 
70
  Use cases:
71
  When the question requires the use of information from wikipedia
@@ -74,13 +76,17 @@ def wiki_search(query: str) -> str:
74
  query: The search query
75
  """
76
 
77
- search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
 
 
 
 
78
  formatted_search_docs = "\n\n---\n\n".join(
79
  [
80
- f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
81
  for doc in search_docs
82
  ])
83
- return {"wiki_results": formatted_search_docs}
84
 
85
 
86
 
@@ -168,6 +174,22 @@ def analyze_video(video_path: str, question: str) -> str:
168
  except Exception as e:
169
  return f"Error analyzing video: {str(e)}"
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  system_prompt = """
173
  You are a helpful assistant tasked with answering questions using a set of tools.
@@ -210,16 +232,26 @@ def restart_required(state: AgentState) -> AgentState:
210
  # return {"messages": messages + [response]}
211
 
212
  # Augment the LLM with tools
213
- tools = [web_search, wiki_search, analyze_image, analyze_video]
214
  tools_by_name = {tool.name: tool for tool in tools}
215
  model_with_tools = model.bind_tools(tools)
216
 
217
  def answer_message(state: AgentState) -> AgentState:
218
  messages = state["messages"]
 
 
219
  prompt = [SystemMessage(f"""
220
  You are a GAIA question answering expert.
221
  Your task is to provide an answer to a question.
222
  Think carefully before answering the question.
 
 
 
 
 
 
 
 
223
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
224
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
225
  If a file is attached, use the appropriate tool (analyze_image or analyze_video) to answer the question based on the file content.
 
1
  import os
2
+ import datetime
3
  from typing import TypedDict, List, Dict, Any, Optional, Union
4
  from langchain_core import tools
5
  from langgraph.graph import StateGraph, START, END
 
12
  from groq import Groq
13
  from langchain_groq import ChatGroq
14
  from langchain_community.document_loaders.image import UnstructuredImageLoader
15
+ from langchain_community.document_loaders import WebBaseLoader
16
  import base64
17
  try:
18
  import cv2
 
67
  @tool
68
  def wiki_search(query: str) -> str:
69
  """
70
+ Search Wikipedia for a query and return up to 3 results.
71
 
72
  Use cases:
73
  When the question requires the use of information from wikipedia
 
76
  query: The search query
77
  """
78
 
79
+ search_docs = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=15000).load()
80
+
81
+ if not search_docs:
82
+ return "No Wikipedia results found."
83
+
84
  formatted_search_docs = "\n\n---\n\n".join(
85
  [
86
+ f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("title", "Unknown Title")}"/>\n{doc.page_content}\n</Document>'
87
  for doc in search_docs
88
  ])
89
+ return formatted_search_docs
90
 
91
 
92
 
 
174
  except Exception as e:
175
  return f"Error analyzing video: {str(e)}"
176
 
177
+ @tool
178
+ def read_url(url: str) -> str:
179
+ """
180
+ Reads and extracts text from a specific webpage URL.
181
+ Use this if a web search snippet doesn't contain enough detail.
182
+ """
183
+ try:
184
+ loader = WebBaseLoader(url)
185
+ docs = loader.load()
186
+ # Truncate to first 15000 characters to fit context
187
+ if not docs:
188
+ return "No content could be extracted from this URL."
189
+ return docs[0].page_content[:15000]
190
+ except Exception as e:
191
+ return f"Error reading URL: {e}"
192
+
193
 
194
  system_prompt = """
195
  You are a helpful assistant tasked with answering questions using a set of tools.
 
232
  # return {"messages": messages + [response]}
233
 
234
  # Augment the LLM with tools
235
+ tools = [web_search, wiki_search, analyze_image, analyze_video, read_url]
236
  tools_by_name = {tool.name: tool for tool in tools}
237
  model_with_tools = model.bind_tools(tools)
238
 
239
  def answer_message(state: AgentState) -> AgentState:
240
  messages = state["messages"]
241
+ current_date = datetime.datetime.now().strftime("%Y-%m-%d")
242
+
243
  prompt = [SystemMessage(f"""
244
  You are a GAIA question answering expert.
245
  Your task is to provide an answer to a question.
246
  Think carefully before answering the question.
247
+
248
+ TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
249
+
250
+ CRITICAL RULES FOR SEARCH:
251
+ 1. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
252
+ 2. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
253
+ 3. Cross-reference facts if they seem ambiguous.
254
+
255
  Do not include any thought process before answering the question, and only response exactly what was being asked of you.
256
  If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
257
  If a file is attached, use the appropriate tool (analyze_image or analyze_video) to answer the question based on the file content.
app copy.py CHANGED
@@ -28,43 +28,49 @@ class BasicAgent:
28
  answer = result['messages'][-1].content
29
  return answer
30
 
31
- def file_extract(local_file_path,task_file_paths):
32
- if local_file_path:
 
 
 
 
 
 
33
  try:
34
- # GAIA's file_path is relative to the dataset repo root.
35
- # Download the file into the allowed cache and get its local path.
36
  resolved_path = hf_hub_download(
37
  repo_id="gaia-benchmark/GAIA",
38
- filename=local_file_path, # e.g. "2023/test/<attachment-id>.pdf"
39
  repo_type="dataset",
40
  )
41
-
42
- task_file_paths[str(task_id)] = resolved_path
43
- logger.debug(
44
- f"Stored file path mapping for task_id {task_id}: {resolved_path}"
45
- )
46
- except Exception as e:
47
- logger.warning(
48
- f"Could not download file '{local_file_path}' for task_id {task_id}: {e}. "
49
- "Mapping skipped."
50
- )
51
 
52
  agent = BasicAgent()
53
  questions_url = f"{DEFAULT_API_URL}/questions"
54
  response = requests.get(questions_url, timeout=15)
55
  response.raise_for_status()
56
  questions_data = response.json()
57
- for item in questions_data[:5]:
58
  question_text = item.get("question")
59
  if question_text is None:
60
  continue
61
  files_text = item.get("files")
62
  task_id = item.get("task_id")
63
  file_name = item.get("file_name")
 
64
  if file_name:
65
- question_text += f"\n\n[Attached File: {file_name}]"
66
- # file = file_extract(,task_id)
67
- print(files_text,task_id)
 
 
 
 
 
68
  output = agent(question_text)
69
  print("Q:", question_text)
70
  print("A:", output)
 
28
  answer = result['messages'][-1].content
29
  return answer
30
 
31
+ def file_extract(local_file_path, task_id):
32
+ if not local_file_path:
33
+ return None
34
+
35
+ # GAIA files are usually placed in date-based subdirectories
36
+ prefixes = ["2023/validation/", "2023/test/", "2023/train/", ""]
37
+
38
+ for prefix in prefixes:
39
  try:
 
 
40
  resolved_path = hf_hub_download(
41
  repo_id="gaia-benchmark/GAIA",
42
+ filename=f"{prefix}{local_file_path}",
43
  repo_type="dataset",
44
  )
45
+ return resolved_path
46
+ except Exception:
47
+ continue
48
+
49
+ logger.warning(f"Could not download file '{local_file_path}' for task_id {task_id}")
50
+ return None
 
 
 
 
51
 
52
  agent = BasicAgent()
53
  questions_url = f"{DEFAULT_API_URL}/questions"
54
  response = requests.get(questions_url, timeout=15)
55
  response.raise_for_status()
56
  questions_data = response.json()
57
+ for item in questions_data[3:4]:
58
  question_text = item.get("question")
59
  if question_text is None:
60
  continue
61
  files_text = item.get("files")
62
  task_id = item.get("task_id")
63
  file_name = item.get("file_name")
64
+
65
  if file_name:
66
+ # Actually download the file to local cache and get absolute path
67
+ resolved_path = file_extract(file_name, task_id)
68
+ if resolved_path:
69
+ question_text += f"\n\n[Attached File Local Path: {resolved_path}]"
70
+ else:
71
+ question_text += f"\n\n[Attached File: {file_name} (Download Failed)]"
72
+
73
+ print(files_text, task_id)
74
  output = agent(question_text)
75
  print("Q:", question_text)
76
  print("A:", output)
requirements.txt CHANGED
@@ -21,4 +21,5 @@ numpy
21
  ddgs
22
  groq
23
  unstructured[all-docs]
24
- opencv-python
 
 
21
  ddgs
22
  groq
23
  unstructured[all-docs]
24
+ opencv-python
25
+ beautifulsoup4