mdicio committed on
Commit
b922347
·
1 Parent(s): b1b39b4
Files changed (1) hide show
  1. agent.py +150 -90
agent.py CHANGED
@@ -95,6 +95,7 @@ class BoomBot:
95
  )
96
  elif self.provider == "meta":
97
  meta_model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
 
98
  # return OpenAIServerModel(
99
  # model_id=meta_model,
100
  # api_base="https://api.deepinfra.com/v1/openai",
@@ -147,29 +148,24 @@ class BoomBot:
147
  download_file,
148
  read_file_content,
149
  visit_webpage,
150
- transcribe_video,
151
  transcribe_audio,
152
  get_wikipedia_info,
153
  arxiv_search,
154
  add_doc_vectorstore,
155
  retrieve_doc_vectorstore,
156
- image_question_answering,
157
  python_interpreter,
158
  final_answer,
159
  ]
160
 
161
  # Additional imports for the Python interpreter
162
  additional_imports = [
 
163
  "json",
164
  "os",
165
  "glob",
166
  "pathlib",
167
- "pandas",
168
- "numpy",
169
- "matplotlib",
170
- "seaborn",
171
- "sklearn",
172
- "tqdm",
173
  "argparse",
174
  "pickle",
175
  "io",
@@ -182,8 +178,20 @@ class BoomBot:
182
  "zipfile",
183
  "itertools",
184
  "functools",
185
- "open",
186
- "requests"
 
 
 
 
 
 
 
 
 
 
 
 
187
  ]
188
 
189
  # Create the agent
@@ -211,64 +219,68 @@ class BoomBot:
211
  """
212
  return """
213
  YOUR BEHAVIOR GUIDELINES:
214
- • Do NOT make unfounded assumptions—always ground answers in reliable sources or search results.
215
- • For math or puzzles: break the problem into code/math, then solve programmatically.
216
-
217
  RESEARCH WORKFLOW:
218
- 1. SEARCH
219
- - Try web_search, wikipedia_search, or arxiv_search first.
220
- - Refine your query rather than repeating the exact same terms.
221
- - If one search tool yields insufficient info, switch to another before downloading.
222
- 2. VISIT
223
- - Use visit_webpage to extract and read page content when a promising link appears after one of the SEARCH tools.
224
- - For each visited link, also download the file and add to the vector store, you might need to query this later, especially if you have a lot of search results.
225
- 3. EVALUATE
226
- - If the page or search snippet fully answers the question, respond immediately.
227
- - ❌ If not, move on to deeper investigation.
228
- 4. DOWNLOAD
229
- - Use download_file_from_link tool on relevant links found (yes you can download webpages as html).
230
- - For arXiv papers, target the /pdf/ or DOI link (e.g https://arxiv.org/pdf/2011.10672).
231
- 5. INDEX & QUERY
232
- - Add downloaded documents to the vector store with add_document_to_vector_store.
233
- - Use query_downloaded_documents for detailed answers.
234
- 6. READ
235
- - You have access to a read_file_content tool to read most types of files (html, pdf, text).
236
- - You can also directly interact with downloaded files (csv, excel) in your python code.
237
- - Use query_downloaded_documents if you have added docs to vector store.
238
-
 
 
 
 
 
 
 
239
  FALLBACK & ADAPTATION:
240
- • If a tool fails, reformulate your query or try a different search method before dropping to download.
241
- If a tool fails multiple times, try a different tool.
242
- • For arXiv: you might discover a paper link via web_search tool and then directly use download_file_from_link tool
243
-
244
- COMMON TOOL CHAINS (conceptual outlines):
245
- These are just guidelines, each task might require a unique workflow.
246
- A tool can provide useful information for the task, it will not always contain the answer. You need to work to get to a final_answer that makes sense.
247
-
248
- FACTUAL Qs:
249
- web_search → final_answer
250
- CURRENT EVENTS:
251
- To have some summary information use web_search, that might output a promising website to visit and read content from using (visit_webpage or download_file_from_link and read_file_content)
252
- web_search → visit_webpage → final_answer
253
- • DOCUMENT-BASED Qs:
254
- web_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents → final_answer
255
- • ARXIV PAPERS:
256
- The arxiv search tool provides a list of results with summary content, to inspect the whole paper you need to download it with download_file_from_link tool.
257
- arxiv_search → download_file_from_link → read_file_content
258
- If that fails
259
- arxiv_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents
260
- • MEDIA ANALYSIS:
261
- download_file_from_link → transcribe_video/transcribe_audio/describe_image → final_answer
262
-
263
  FINAL ANSWER FORMAT:
264
- ** Do not name any python variables final_answer, this causes problems with tools.
265
- - Begin with "FINAL ANSWER: "
266
- - Number → digits only (e.g., 42) no units unless specified
267
- - String → exact text (e.g., Pope Francis) without quotation marks
268
- - List → comma-separated, one space, no brackets unless specified (e.g., 2, 3, 4)
269
- - Conclude with: FINAL ANSWER: <your_answer>
270
  """
271
 
 
272
  def run(self, question: str, task_id: str, to_download) -> str:
273
  """
274
  Run the agent with the given question, task_id, and download flag.
@@ -307,53 +319,101 @@ class BoomBot:
307
 
308
 
309
  if __name__ == "__main__":
 
 
310
  import time
311
- from utils import load_online_qas, extract_final_answer
312
  import requests
313
- import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  agent = BoomBot(provider="gemma")
316
- file_online = load_online_qas(file_path = r"../../Final_Assignment_Template/allqas.jsonl", has_file=True)
317
- results = []
 
 
318
 
319
  excluded_keywords = ["youtube", "video", "chess"]
 
320
 
 
321
  for entry in file_online:
322
- task_id = entry["task_id"]
323
- question = entry["Question"]
324
  real_answer = entry["Final answer"]
325
- file_name = entry.get("file_name", "")
326
- to_download = file_name != ""
327
- link = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
328
 
329
- # Check exclusion and file availability
330
  if any(kw in question.lower() for kw in excluded_keywords):
331
- llm_answer = "NOT ATTEMPTED"
332
- processed_answer = llm_answer
333
  else:
334
  try:
335
- response = requests.get(link)
336
- if response.status_code != 200:
337
- llm_answer = "NOT ATTEMPTED"
338
- processed_answer = llm_answer
339
  else:
340
-
341
  llm_answer = agent.run(question, task_id, to_download)
342
- processed_answer = str(extract_final_answer(llm_answer))
343
  # time.sleep(10)
344
  except Exception as e:
345
- llm_answer = processed_answer = f"[Error] {e}"
346
  # time.sleep(6)
347
 
348
- results.append({
349
- "question": question,
350
- "llm_answer": llm_answer,
351
- "processed_answer": processed_answer.strip(),
352
- "real_answer": real_answer
 
 
353
  })
 
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  print("REAL ANSWER:", real_answer)
356
 
357
- # Save all results to file
358
- with open("llm_eval.json", "w", encoding="utf-8") as f:
359
- json.dump(results, f, indent=2, ensure_ascii=False)
 
 
 
95
  )
96
  elif self.provider == "meta":
97
  meta_model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
98
+ meta_model = "Qwen/Qwen2.5-72B-Instruct"
99
  # return OpenAIServerModel(
100
  # model_id=meta_model,
101
  # api_base="https://api.deepinfra.com/v1/openai",
 
148
  download_file,
149
  read_file_content,
150
  visit_webpage,
151
+ # transcribe_video,
152
  transcribe_audio,
153
  get_wikipedia_info,
154
  arxiv_search,
155
  add_doc_vectorstore,
156
  retrieve_doc_vectorstore,
157
+ # image_question_answering,
158
  python_interpreter,
159
  final_answer,
160
  ]
161
 
162
  # Additional imports for the Python interpreter
163
  additional_imports = [
164
+ # Built-in / core Python
165
  "json",
166
  "os",
167
  "glob",
168
  "pathlib",
 
 
 
 
 
 
169
  "argparse",
170
  "pickle",
171
  "io",
 
178
  "zipfile",
179
  "itertools",
180
  "functools",
181
+ "requests",
182
+ "bs4",
183
+ # Data handling
184
+ "pandas",
185
+ "numpy",
186
+ "dask", # For handling large datasets
187
+ "polars", # Fast DataFrame alternative
188
+ "pyarrow", # For Arrow/Parquet file formats
189
+ "h5py", # For HDF5 files
190
+ "openpyxl", # Excel reading/writing
191
+ "yaml", # Config file parsing
192
+ # Basic plotting
193
+ "matplotlib",
194
+ "seaborn"
195
  ]
196
 
197
  # Create the agent
 
219
  """
220
  return """
221
  YOUR BEHAVIOR GUIDELINES:
222
+ • Do NOT make unfounded assumptions—always ground answers in reliable sources or search results.
223
+ • For math or puzzles: break the problem into code/math, then solve programmatically.
224
+
225
  RESEARCH WORKFLOW:
226
+ 1. SEARCH
227
+ - Begin with web_search, wikipedia_search, or arxiv_search.
228
+ - Refine your query if results are weak—don't just retry the same terms.
229
+ - If one search tool yields little, try another before moving on to downloads.
230
+
231
+ 2. VISIT
232
+ - Use visit_webpage to preview content from promising links.
233
+ - If the content is long, complex, spans multiple pages, or may be needed later, do NOT rely solely on visit_webpage.
234
+ - Move quickly to downloading: avoid repeated visits when the content should be archived.
235
+
236
+ 3. DOWNLOAD (MANDATORY IF CONTENT IS LONG, DENSE, OR CRUCIAL)
237
+ - Use download_file_from_link on all valuable resources (including html pages or pdfs).
238
+ - Especially when a page is detailed, technical, or multi-part, downloading is preferred.
239
+ - You can (and should) download webpages as HTML. Do this whenever the site might be referenced again later.
240
+
241
+ 4. INDEX & QUERY
242
+ - Immediately add downloaded files to the vector store using add_document_to_vector_store.
243
+ - For complex tasks or unclear answers, prefer querying vector store over re-visiting pages.
244
+ - If you've downloaded a file, **always index it unless clearly irrelevant.**
245
+
246
+ 5. READ
247
+ - Use read_file_content to analyze file contents (html, pdf, text).
248
+ - You can also use query_downloaded_documents for deeper understanding.
249
+
250
+ 6. EVALUATE
251
+ - ✅ If the answer is clear from current sources, respond.
252
+ - ❌ If not, continue iterating and analyzing downloaded material.
253
+
254
  FALLBACK & ADAPTATION:
255
+ • If a tool fails, reformulate or switch tools.
256
+ • For arXiv: web_search might help you find the paper; follow with direct download of the PDF via download_file_from_link.
257
+
258
+ MANDATORY DOWNLOAD & INDEX WHEN:
259
+ • The page is lengthy or technical (e.g., research papers, government sites, legal docs, blog posts with code).
260
+ • You suspect you'll need to return to the content.
261
+ • You are working on multi-hop reasoning or long-term memory tasks.
262
+
263
+ COMMON TOOL CHAINS:
264
+ • FACTUAL Qs:
265
+ web_search → final_answer
266
+ • CURRENT EVENTS:
267
+ web_search → visit_webpage → (download + index if needed) → final_answer
268
+ • DOCUMENT-BASED Qs:
269
+ web_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents → final_answer
270
+ • ARXIV PAPERS:
271
+ arxiv_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents → final_answer
272
+ • MEDIA ANALYSIS:
273
+ download_file_from_link → transcribe_audio → final_answer
274
+
 
 
 
275
  FINAL ANSWER FORMAT:
276
+ - Begin with "FINAL ANSWER: "
277
+ - Number → digits only (e.g., 42)
278
+ - String → exact text (e.g., Pope Francis) without quotation marks
279
+ - List → comma-separated, no brackets unless specified (e.g., 2, 3, 4)
280
+ - End with: FINAL ANSWER: <your_answer>
 
281
  """
282
 
283
+
284
  def run(self, question: str, task_id: str, to_download) -> str:
285
  """
286
  Run the agent with the given question, task_id, and download flag.
 
319
 
320
 
321
  if __name__ == "__main__":
322
+ import os
323
+ import csv
324
  import time
 
325
  import requests
326
+ from utils import load_online_qas, extract_final_answer
327
+
328
# Default location of the evaluation log and the column order of every row.
CSV_FILE = "evals/llm_eval.csv"
FIELDNAMES = [
    "model",
    "task_id",
    "question",
    "llm_answer",
    "processed_answer",
    "real_answer",
]


def ensure_csv(path=None):
    """Make sure the results CSV exists and begins with the header row.

    Args:
        path: CSV file to prepare. Defaults to ``CSV_FILE`` when omitted.

    The parent directory is created when it is missing: ``open(..., "w")``
    does not create intermediate directories, so without this step a fresh
    checkout (no ``evals/`` folder) fails with ``FileNotFoundError`` only
    after the whole evaluation run has finished. A file that exists but is
    still empty also receives the header. A non-empty file is left untouched.
    """
    target = CSV_FILE if path is None else path

    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Anything already written is assumed to start with the header.
    if os.path.isfile(target) and os.path.getsize(target) > 0:
        return

    with open(target, mode="w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()


def append_results(rows, path=None):
    """Append result rows to the CSV, creating file and header if needed.

    Args:
        rows: Iterable of dicts keyed by ``FIELDNAMES``.
        path: CSV file to append to. Defaults to ``CSV_FILE`` when omitted.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a row contains a key
            that is not listed in ``FIELDNAMES``.
    """
    target = CSV_FILE if path is None else path

    # Guarantee directory + header so this helper is safe to call on its own.
    ensure_csv(target)

    with open(target, mode="a", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writerows(rows)
344
 
345
  agent = BoomBot(provider="gemma")
346
+ model_name = agent.provider # e.g. "gemma"
347
+
348
+ file_online = load_online_qas(file_path=r"../../Final_Assignment_Template/allqas.jsonl", has_file=True)
349
+ nofile_online = load_online_qas(file_path=r"../../Final_Assignment_Template/allqas.jsonl", has_file=False)
350
 
351
  excluded_keywords = ["youtube", "video", "chess"]
352
+ rows_to_append = []
353
 
354
+ # 1) With downloadable files
355
  for entry in file_online:
356
+ task_id = entry["task_id"]
357
+ question = entry["Question"]
358
  real_answer = entry["Final answer"]
359
+ file_name = entry.get("file_name", "")
360
+ to_download = bool(file_name)
361
+ link = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
362
 
 
363
  if any(kw in question.lower() for kw in excluded_keywords):
364
+ llm_answer = processed = "NOT ATTEMPTED"
 
365
  else:
366
  try:
367
+ resp = requests.get(link)
368
+ if resp.status_code != 200:
369
+ llm_answer = processed = "NOT ATTEMPTED"
 
370
  else:
 
371
  llm_answer = agent.run(question, task_id, to_download)
372
+ processed = extract_final_answer(llm_answer).strip()
373
  # time.sleep(10)
374
  except Exception as e:
375
+ llm_answer = processed = f"[Error] {e}"
376
  # time.sleep(6)
377
 
378
+ rows_to_append.append({
379
+ "model": model_name,
380
+ "task_id": task_id,
381
+ "question": question,
382
+ "llm_answer": llm_answer,
383
+ "processed_answer": processed,
384
+ "real_answer": real_answer,
385
  })
386
+ print("REAL ANSWER:", real_answer)
387
 
388
+ # 2) Without downloadable files
389
+ for entry in nofile_online:
390
+ task_id = entry["task_id"]
391
+ question = entry["Question"]
392
+ real_answer = entry["Final answer"]
393
+
394
+ if any(kw in question.lower() for kw in excluded_keywords):
395
+ llm_answer = processed = "NOT ATTEMPTED"
396
+ else:
397
+ try:
398
+ llm_answer = agent.run(question, task_id, to_download=False)
399
+ processed = extract_final_answer(llm_answer).strip()
400
+ # time.sleep(10)
401
+ except Exception as e:
402
+ llm_answer = processed = f"[Error] {e}"
403
+ # time.sleep(6)
404
+
405
+ rows_to_append.append({
406
+ "model": model_name,
407
+ "task_id": task_id,
408
+ "question": question,
409
+ "llm_answer": llm_answer,
410
+ "processed_answer": processed,
411
+ "real_answer": real_answer,
412
+ })
413
  print("REAL ANSWER:", real_answer)
414
 
415
+ # ensure CSV exists and append
416
+ ensure_csv()
417
+ append_results(rows_to_append)
418
+
419
+ print(f"✅ Appended {len(rows_to_append)} rows to {CSV_FILE}")