sajjadpsavoji committed on
Commit
e05ac72
·
1 Parent(s): f79b0ac
Files changed (1) hide show
  1. app.py +305 -91
app.py CHANGED
@@ -48,7 +48,7 @@ def extract_final_answer(text: str) -> str:
48
  matches = FINAL_ANSWER_RE.findall(text)
49
  if matches:
50
  return matches[-1].strip()
51
- return text.strip()
52
 
53
  def is_number(s: str) -> bool:
54
  try:
@@ -106,6 +106,56 @@ def fast_heuristic_match(pred: str, gold: str) -> bool:
106
  return True
107
  return False
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # --- Gold Answers Loader ---
110
  class GoldAnswers:
111
  """
@@ -182,14 +232,12 @@ class JudgeAgent:
182
  direct use of model.generate signatures — this mirrors the GAIA agent path.
183
  """
184
  def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
185
- # Reuse the exact same OpenAIServerModel instance (base_model)
186
  self.verbose = verbose
187
- # No tools required for judging; keep it simple
188
  self.agent = CodeAgent(
189
  tools=[],
190
  model=base_model,
191
- add_base_tools=False, # no need for memory/python exec for judging
192
- planning_interval=0, # no re-planning needed
193
  verbosity_level=2 if verbose else 0,
194
  additional_authorized_imports=[]
195
  )
@@ -199,14 +247,10 @@ class JudgeAgent:
199
  if fast_heuristic_match(predicted, gold):
200
  return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
201
 
202
- # Build a single prompt that includes the system guidance and the user content.
203
- # With CodeAgent, we put the system message at the top of the prompt text.
204
  prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
205
-
206
  try:
207
- raw = self.agent.run(prompt) # returns a string via the same path as GAIAAgent
208
  text = (raw or "").strip()
209
- # Extract the JSON object
210
  m = re.search(r"\{.*\}", text, flags=re.DOTALL)
211
  payload = json.loads(m.group(0) if m else text)
212
 
@@ -215,7 +259,6 @@ class JudgeAgent:
215
  justification = str(payload.get("justification", "")).strip()[:300]
216
 
217
  return {"is_correct": is_correct, "score": score, "justification": justification}
218
-
219
  except Exception as e:
220
  return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
221
 
@@ -276,28 +319,26 @@ class GAIAAgent:
276
  "sdrawkcab" in text
277
  )
278
 
279
- def __call__(self, question: str) -> str:
280
- if self.verbose:
281
- print(f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}")
282
-
283
- if self._is_reversed_text(question):
284
- prompt = f"""
285
- You are a general AI assistant. I will ask you a question.
286
-
287
- This question appears to be in reversed text. Here is the reversed version for clarity:
288
- {question[::-1]}
289
-
290
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
291
 
292
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
293
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
294
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
295
- - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 
 
296
 
297
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
298
  - Use web search sparingly and only when absolutely necessary.
299
- - Limit to 1-2 web searches per question.
300
- - If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
301
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
302
  - Focus on answering directly with what you already know when possible.
303
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
@@ -305,21 +346,29 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
305
 
306
  Remember to structure your response in Python code format using the final_answer() function.
307
  """
308
- else:
309
- prompt = f"""
310
- You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
313
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
314
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
315
- - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
316
-
317
- Question: {question}
318
 
319
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
320
  - Use web search sparingly and only when absolutely necessary.
321
- - Limit to 1-2 web searches per question.
322
- - If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
323
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
324
  - Focus on answering directly with what you already know when possible.
325
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
@@ -327,6 +376,17 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
327
 
328
  Remember to structure your response in Python code format using the final_answer() function.
329
  """
 
 
 
 
 
 
 
 
 
 
 
330
  try:
331
  answer = self.agent.run(prompt)
332
  if self.verbose:
@@ -338,15 +398,55 @@ Remember to structure your response in Python code format using the final_answer
338
  print(error_msg)
339
  return error_msg
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  # --- Singletons for judge/gold ---
342
  gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
343
  _judge_agent_singleton: Optional[JudgeAgent] = None
344
 
345
  # --- Runner & Submitter (with judge integration) ---
346
- def run_and_submit_all(sample_size: int = 0):
 
 
 
 
 
 
347
  """
348
  Fetches all questions, runs the agent on them, judges locally (if gold available),
349
- submits answers, and returns a results table for the UI.
 
 
 
350
  """
351
  username = "Gralon"
352
  print(f"Using username: {username}")
@@ -360,19 +460,14 @@ def run_and_submit_all(sample_size: int = 0):
360
  agent = GAIAAgent(verbose=True)
361
  except Exception as e:
362
  print(f"Error instantiating agent: {e}")
363
- return f"Error initializing agent: {e}", None
364
 
365
  # 1b. Init JudgeAgent once, reusing the SAME model instance
366
- global _judge_agent_singleton
367
- if _judge_agent_singleton is None:
368
- _judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
369
 
370
  # Derive code URL for submission
371
  space_id = os.getenv("SPACE_ID")
372
- if space_id:
373
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
374
- else:
375
- agent_code = "local"
376
 
377
  # 2. Fetch Questions
378
  print(f"Fetching questions from: {questions_url}")
@@ -382,62 +477,146 @@ def run_and_submit_all(sample_size: int = 0):
382
  questions_data = response.json()
383
  if not questions_data:
384
  print("Fetched questions list is empty.")
385
- return "Fetched questions list is empty or invalid format.", None
386
  print(f"Fetched {len(questions_data)} questions.")
387
  except requests.exceptions.RequestException as e:
388
  print(f"Error fetching questions: {e}")
389
- return f"Error fetching questions: {e}", None
390
  except json.JSONDecodeError as e:
391
  print(f"Error decoding JSON response from questions endpoint: {e}")
392
  print(f"Response text: {response.text[:500]}")
393
- return f"Error decoding server response for questions: {e}", None
394
  except Exception as e:
395
  print(f"An unexpected error occurred fetching questions: {e}")
396
- return f"An unexpected error occurred fetching questions: {e}", None
397
-
398
- # 3. Run Agent + Judge
399
- results_log = []
400
- answers_payload = []
401
 
 
402
  if sample_size > 0 and sample_size < len(questions_data):
403
  import random
404
  print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
405
  questions_data = random.sample(questions_data, sample_size)
406
 
407
  print(f"Running agent on {len(questions_data)} questions...")
 
 
 
 
408
  for i, item in enumerate(questions_data):
409
  task_id = item.get("task_id")
410
  question_text = item.get("question")
411
  if not task_id or question_text is None:
412
  print(f"Skipping item with missing task_id or question: {item}")
413
  continue
 
 
 
 
414
  try:
415
  print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
416
- submitted_answer_raw = agent(question_text)
417
- submitted_answer = extract_final_answer(submitted_answer_raw)
418
-
419
- # Local judge (if we have gold)
420
- gold = gold_answers.by_task_id.get(task_id)
421
- judge_is_correct = None
422
- judge_score = None
423
- judge_just = None
 
 
 
424
  if gold:
425
- judge_res = _judge_agent_singleton.judge(question_text, submitted_answer, gold)
426
- judge_is_correct = judge_res.get("is_correct")
427
- judge_score = judge_res.get("score")
428
- judge_just = judge_res.get("justification")
 
 
 
 
 
 
 
 
 
 
429
 
430
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  results_log.append({
432
  "Task ID": task_id,
433
  "Question": question_text,
434
- "Submitted Answer": submitted_answer,
435
  "Gold (local)": gold if gold else "",
436
- "Judge Correct?": judge_is_correct,
437
- "Judge Score": judge_score,
438
- "Judge Note": judge_just
439
  })
440
- print(f"Successfully processed question {i+1}")
 
 
 
441
 
442
  if i < len(questions_data) - 1:
443
  print("Waiting 2 seconds before next question...")
@@ -457,7 +636,7 @@ def run_and_submit_all(sample_size: int = 0):
457
 
458
  if not answers_payload:
459
  print("Agent did not produce any answers to submit.")
460
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
461
 
462
  # 4. Prepare Submission
463
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
@@ -479,7 +658,8 @@ def run_and_submit_all(sample_size: int = 0):
479
  )
480
  print("Submission successful.")
481
  results_df = pd.DataFrame(results_log)
482
- return final_status, results_df
 
483
  except requests.exceptions.HTTPError as e:
484
  error_detail = f"Server responded with status {e.response.status_code}."
485
  try:
@@ -490,29 +670,50 @@ def run_and_submit_all(sample_size: int = 0):
490
  status_message = f"Submission Failed: {error_detail}"
491
  print(status_message)
492
  results_df = pd.DataFrame(results_log)
493
- return status_message, results_df
 
494
  except requests.exceptions.Timeout:
495
  status_message = "Submission Failed: The request timed out."
496
  print(status_message)
497
  results_df = pd.DataFrame(results_log)
498
- return status_message, results_df
 
499
  except requests.exceptions.RequestException as e:
500
  status_message = f"Submission Failed: Network error - {e}"
501
  print(status_message)
502
  results_df = pd.DataFrame(results_log)
503
- return status_message, results_df
 
504
  except Exception as e:
505
  status_message = f"An unexpected error occurred during submission: {e}"
506
  print(status_message)
507
  results_df = pd.DataFrame(results_log)
508
- return status_message, results_df
 
509
 
510
- def test_single_question(question: str) -> str:
511
- """Test the agent on a single question (no submission)."""
512
  try:
513
  agent = GAIAAgent(verbose=True)
514
- answer = agent(question)
515
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  except Exception as e:
517
  return f"Error: {e}"
518
 
@@ -521,10 +722,8 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
521
  # try task_id lookup first
522
  gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
523
  agent = GAIAAgent(verbose=False)
524
- global _judge_agent_singleton
525
- if _judge_agent_singleton is None:
526
- _judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
527
- res = _judge_agent_singleton.judge(question, predicted, gold)
528
  out = {
529
  "Gold": gold,
530
  "is_correct": res["is_correct"],
@@ -535,7 +734,7 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
535
 
536
  # --- Build Gradio Interface using Blocks ---
537
  with gr.Blocks() as demo:
538
- gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge")
539
  gr.Markdown(
540
  """
541
  ## Instructions:
@@ -545,7 +744,7 @@ with gr.Blocks() as demo:
545
  3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
546
 
547
  This agent runs locally, uses an LLM judge against your answers.csv (if present),
548
- and then submits answers to the server.
549
  """
550
  )
551
 
@@ -553,12 +752,13 @@ with gr.Blocks() as demo:
553
 
554
  with gr.Tab("Test Single Question"):
555
  test_input = gr.Textbox(label="Enter a question to test", lines=3)
 
556
  test_output = gr.Textbox(label="Answer", lines=3)
557
  test_button = gr.Button("Test Question")
558
 
559
  test_button.click(
560
  fn=test_single_question,
561
- inputs=test_input,
562
  outputs=test_output
563
  )
564
 
@@ -579,15 +779,29 @@ with gr.Blocks() as demo:
579
  label="Sample Size (0 for all questions)",
580
  info="Set a number to limit how many questions to process (reduces costs)"
581
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
- run_button = gr.Button("Run Evaluation, Judge Locally & Submit")
584
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
585
- results_table = gr.DataFrame(label="Questions, Answers & Local Judge", wrap=True)
 
586
 
587
  run_button.click(
588
  fn=run_and_submit_all,
589
- inputs=sample_size,
590
- outputs=[status_output, results_table]
591
  )
592
 
593
  if __name__ == "__main__":
 
48
  matches = FINAL_ANSWER_RE.findall(text)
49
  if matches:
50
  return matches[-1].strip()
51
+ return (text or "").strip()
52
 
53
  def is_number(s: str) -> bool:
54
  try:
 
106
  return True
107
  return False
108
 
109
def quick_format_fix(answer: str, question: str) -> str:
    """
    Deterministic, judge-friendly cleanup of a final answer. Gold is never used.

    Steps, in order:
    - Strip surrounding code fences.
    - Collapse whitespace and drop trailing sentence punctuation.
    - Convert semicolon/slash separators in list-like answers to commas.
    - Drop a leading article ("a"/"an"/"the").
    - Remove thousands separators from purely numeric answers (1,234 -> 1234).
    - Strip "$" / "%" unless the question appears to ask for them explicitly.

    Args:
        answer: Raw final-answer string from the agent.
        question: Original question text, used only to decide whether
            currency/percent signs should be kept.

    Returns:
        The cleaned answer string; non-string inputs are returned unchanged.
    """
    if not isinstance(answer, str):
        return answer

    s = answer.strip()

    # remove code fences around final answer if any
    s = re.sub(r"^```.*?\n", "", s, flags=re.DOTALL)
    s = s.replace("```", "").strip()

    # normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # drop trailing period if looks like a sentence end
    s = re.sub(r"[.。]+$", "", s)

    # if list-like but uses semicolons or slashes, convert to commas
    if ";" in s or "/" in s:
        s = re.sub(r"[;/]+", ",", s)
        s = re.sub(r"\s*,\s*", ", ", s)  # pretty spacing

    # remove leading articles for string-y answers
    # BUGFIX: the original pattern "^(?i)(a|an|the)\s+" put the global (?i)
    # flag mid-pattern, which raises re.error on Python 3.11+; use flags=.
    s = re.sub(r"^(a|an|the)\s+", "", s, flags=re.IGNORECASE)

    # remove thousands commas in numbers like 1,234 -> 1234
    # BUGFIX: the old guard `"," in s and not re.search(r".*,.*", s)` was
    # always False (any comma matched the regex), so this never ran. The
    # fullmatch alone is the correct guard: it only matches a single
    # comma-grouped number, never a comma-separated list of answers.
    if re.fullmatch(r"\d{1,3}(,\d{3})+(\.\d+)?", s):
        s = s.replace(",", "")

    # remove currency unless explicitly requested
    if "$" in s and not re.search(r"(?i)\b(dollar|usd|\$)\b.*(include|keep|use)|include\s*\$", question):
        s = s.replace("$", "")

    # percent sign rules: keep only if question appears to require it explicitly
    needs_percent = bool(re.search(r"(?i)\b(percent|%)\b.*(include|with|as sign)|include\s*%", question))
    if "%" in s and not needs_percent:
        s = s.replace("%", "")

    return s.strip()
158
+
159
  # --- Gold Answers Loader ---
160
  class GoldAnswers:
161
  """
 
232
  direct use of model.generate signatures — this mirrors the GAIA agent path.
233
  """
234
def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
    """Wrap the shared OpenAIServerModel in a minimal, tool-free CodeAgent."""
    self.verbose = verbose
    # Judging needs no tools, no base tooling, and no planning loop.
    agent_kwargs = {
        "tools": [],
        "model": base_model,
        "add_base_tools": False,
        "planning_interval": 0,
        "verbosity_level": 2 if verbose else 0,
        "additional_authorized_imports": [],
    }
    self.agent = CodeAgent(**agent_kwargs)
 
247
  if fast_heuristic_match(predicted, gold):
248
  return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
249
 
 
 
250
  prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
 
251
  try:
252
+ raw = self.agent.run(prompt)
253
  text = (raw or "").strip()
 
254
  m = re.search(r"\{.*\}", text, flags=re.DOTALL)
255
  payload = json.loads(m.group(0) if m else text)
256
 
 
259
  justification = str(payload.get("justification", "")).strip()[:300]
260
 
261
  return {"is_correct": is_correct, "score": score, "justification": justification}
 
262
  except Exception as e:
263
  return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
264
 
 
319
  "sdrawkcab" in text
320
  )
321
 
322
+ def _base_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
323
+ # Let retries slightly relax the search budget
324
+ search_budget_line = (
325
+ "- Limit to 1-2 web searches per question.\n"
326
+ if not allow_extra_searches else
327
+ "- You may use up to 3-4 web searches if needed.\n"
328
+ )
329
+ return f"""
330
+ You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 
 
 
331
 
332
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
333
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
334
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
335
+ - If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
336
+
337
+ Question: {question}
338
 
339
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
340
  - Use web search sparingly and only when absolutely necessary.
341
+ {search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
 
342
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
343
  - Focus on answering directly with what you already know when possible.
344
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
 
346
 
347
  Remember to structure your response in Python code format using the final_answer() function.
348
  """
349
+
350
+ def _reversed_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
351
+ search_budget_line = (
352
+ "- Limit to 1-2 web searches per question.\n"
353
+ if not allow_extra_searches else
354
+ "- You may use up to 3-4 web searches if needed.\n"
355
+ )
356
+ return f"""
357
+ You are a general AI assistant. I will ask you a question.
358
+
359
+ This question appears to be in reversed text. Here is the reversed version for clarity:
360
+ {question[::-1]}
361
+
362
+ Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
363
 
364
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
365
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
366
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
367
+ - If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
 
 
368
 
369
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
370
  - Use web search sparingly and only when absolutely necessary.
371
+ {search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
 
372
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
373
  - Focus on answering directly with what you already know when possible.
374
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
 
376
 
377
  Remember to structure your response in Python code format using the final_answer() function.
378
  """
379
+
380
+ def __call__(self, question: str, allow_extra_searches: bool = False) -> str:
381
+ if self.verbose:
382
+ msg = f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}"
383
+ print(msg)
384
+
385
+ prompt = (
386
+ self._reversed_prompt(question, allow_extra_searches)
387
+ if self._is_reversed_text(question)
388
+ else self._base_prompt(question, allow_extra_searches)
389
+ )
390
  try:
391
  answer = self.agent.run(prompt)
392
  if self.verbose:
 
398
  print(error_msg)
399
  return error_msg
400
 
401
def refine(self, question: str, prev_answer: str, judge_feedback: str, attempt_no: int) -> str:
    """
    Reflection-based reattempt without using gold.
    """
    if self.verbose:
        print(f"Refining (attempt {attempt_no}) based on judge note: {judge_feedback}")

    # From the second retry onward, relax the search budget.
    allow_extra = attempt_no >= 2
    base = self._base_prompt(question, allow_extra_searches=allow_extra)

    refinement_addendum = f"""
Your previous FINAL ANSWER was:
{prev_answer}

A strict judge said this answer was incorrect for the following reason(s) (be concise): {judge_feedback}

Re-evaluate the question carefully. Consider possible formatting issues (units, articles, thousands commas), list ordering (only if the question requires a specific order), and rounding.
Produce a NEW final answer. Do not repeat the previous final answer if you think it was wrong.
"""

    try:
        result = self.agent.run(base + refinement_addendum)
    except Exception as exc:
        result = f"Error refining: {exc}"
        if self.verbose:
            print(result)
        return result

    if self.verbose:
        print(f"Refined answer: {result}")
    return result
431
+
432
  # --- Singletons for judge/gold ---
433
  gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
434
  _judge_agent_singleton: Optional[JudgeAgent] = None
435
 
436
  # --- Runner & Submitter (with judge integration) ---
437
def _ensure_judge(model: OpenAIServerModel) -> JudgeAgent:
    """Return the shared JudgeAgent, creating it lazily on first use."""
    global _judge_agent_singleton
    if _judge_agent_singleton is not None:
        return _judge_agent_singleton
    # First call: build the singleton, reusing the caller's model instance.
    _judge_agent_singleton = JudgeAgent(base_model=model, verbose=False)
    return _judge_agent_singleton
442
+
443
+ def run_and_submit_all(sample_size: int = 0, max_retries: int = 1, use_local_judge_to_select: bool = True):
444
  """
445
  Fetches all questions, runs the agent on them, judges locally (if gold available),
446
+ optionally reattempts on incorrect results, submits answers, and returns:
447
+ - final status string
448
+ - final results dataframe (one row per question)
449
+ - attempt log dataframe (one row per attempt)
450
  """
451
  username = "Gralon"
452
  print(f"Using username: {username}")
 
460
  agent = GAIAAgent(verbose=True)
461
  except Exception as e:
462
  print(f"Error instantiating agent: {e}")
463
+ return f"Error initializing agent: {e}", None, None
464
 
465
  # 1b. Init JudgeAgent once, reusing the SAME model instance
466
+ judge_agent = _ensure_judge(agent.agent.model)
 
 
467
 
468
  # Derive code URL for submission
469
  space_id = os.getenv("SPACE_ID")
470
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
 
 
 
471
 
472
  # 2. Fetch Questions
473
  print(f"Fetching questions from: {questions_url}")
 
477
  questions_data = response.json()
478
  if not questions_data:
479
  print("Fetched questions list is empty.")
480
+ return "Fetched questions list is empty or invalid format.", None, None
481
  print(f"Fetched {len(questions_data)} questions.")
482
  except requests.exceptions.RequestException as e:
483
  print(f"Error fetching questions: {e}")
484
+ return f"Error fetching questions: {e}", None, None
485
  except json.JSONDecodeError as e:
486
  print(f"Error decoding JSON response from questions endpoint: {e}")
487
  print(f"Response text: {response.text[:500]}")
488
+ return f"Error decoding server response for questions: {e}", None, None
489
  except Exception as e:
490
  print(f"An unexpected error occurred fetching questions: {e}")
491
+ return f"An unexpected error occurred fetching questions: {e}", None, None
 
 
 
 
492
 
493
+ # Sampling
494
  if sample_size > 0 and sample_size < len(questions_data):
495
  import random
496
  print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
497
  questions_data = random.sample(questions_data, sample_size)
498
 
499
  print(f"Running agent on {len(questions_data)} questions...")
500
+ results_log: List[Dict[str, Any]] = []
501
+ attempts_log: List[Dict[str, Any]] = []
502
+ answers_payload: List[Dict[str, Any]] = []
503
+
504
  for i, item in enumerate(questions_data):
505
  task_id = item.get("task_id")
506
  question_text = item.get("question")
507
  if not task_id or question_text is None:
508
  print(f"Skipping item with missing task_id or question: {item}")
509
  continue
510
+
511
+ gold = gold_answers.by_task_id.get(task_id)
512
+ per_question_attempts: List[Dict[str, Any]] = []
513
+
514
  try:
515
  print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
516
+
517
+ # -- First attempt
518
+ raw = agent(question_text, allow_extra_searches=False)
519
+ ans = extract_final_answer(raw)
520
+ fixed = quick_format_fix(ans, question_text) or ans
521
+
522
+ # judge first (on fixed)
523
+ jres = None
524
+ j_is_correct = None
525
+ j_score = None
526
+ j_note = None
527
  if gold:
528
+ jres = judge_agent.judge(question_text, fixed, gold)
529
+ j_is_correct = jres.get("is_correct")
530
+ j_score = jres.get("score")
531
+ j_note = jres.get("justification")
532
+
533
+ per_question_attempts.append({
534
+ "Task ID": task_id,
535
+ "Attempt": 1,
536
+ "Submitted Answer (raw)": ans,
537
+ "Submitted Answer (fixed)": fixed,
538
+ "Judge Correct?": j_is_correct,
539
+ "Judge Score": j_score,
540
+ "Judge Note": j_note
541
+ })
542
 
543
+ best_answer = fixed
544
+ best_score = j_score if j_score is not None else 0.0
545
+ best_correct = j_is_correct
546
+
547
+ retries = 0
548
+ while (j_is_correct is False) and (retries < max_retries):
549
+ retries += 1
550
+
551
+ # Try reflective retry
552
+ refined_raw = agent.refine(
553
+ question=question_text,
554
+ prev_answer=fixed,
555
+ judge_feedback=j_note or "Format/content mismatch.",
556
+ attempt_no=retries
557
+ )
558
+ refined = extract_final_answer(refined_raw)
559
+ refined_fixed = quick_format_fix(refined, question_text) or refined
560
+
561
+ # Judge the refined answer
562
+ j2 = None
563
+ j2_is_correct = None
564
+ j2_score = None
565
+ j2_note = None
566
+ if gold:
567
+ j2 = judge_agent.judge(question_text, refined_fixed, gold)
568
+ j2_is_correct = j2.get("is_correct")
569
+ j2_score = j2.get("score")
570
+ j2_note = j2.get("justification")
571
+
572
+ per_question_attempts.append({
573
+ "Task ID": task_id,
574
+ "Attempt": retries + 1,
575
+ "Submitted Answer (raw)": refined,
576
+ "Submitted Answer (fixed)": refined_fixed,
577
+ "Judge Correct?": j2_is_correct,
578
+ "Judge Score": j2_score,
579
+ "Judge Note": j2_note
580
+ })
581
+
582
+ # Decide whether to keep this as best
583
+ if use_local_judge_to_select and gold and (j2_score is not None):
584
+ if (j2_score > (best_score or 0)) or (best_score is None):
585
+ best_answer, best_score, best_correct = refined_fixed, j2_score, j2_is_correct
586
+ else:
587
+ # If we don't have gold/judge, prefer the newest answer
588
+ best_answer = refined_fixed
589
+ best_score = j2_score if j2_score is not None else best_score
590
+ best_correct = j2_is_correct if j2_is_correct is not None else best_correct
591
+
592
+ # Prepare for another retry if needed
593
+ fixed = refined_fixed
594
+ j_is_correct = j2_is_correct
595
+ j_score = j2_score
596
+ j_note = j2_note
597
+
598
+ if j2_is_correct:
599
+ break
600
+
601
+ if retries < max_retries:
602
+ print("Waiting 2 seconds before next attempt...")
603
+ time.sleep(2)
604
+
605
+ # Append final choice per question
606
+ answers_payload.append({"task_id": task_id, "submitted_answer": best_answer})
607
  results_log.append({
608
  "Task ID": task_id,
609
  "Question": question_text,
610
+ "Submitted Answer": best_answer,
611
  "Gold (local)": gold if gold else "",
612
+ "Judge Correct?": best_correct,
613
+ "Judge Score": best_score,
614
+ "Judge Note": j_note
615
  })
616
+ print(f"Finished question {i+1}")
617
+
618
+ # Add to global attempts log
619
+ attempts_log.extend(per_question_attempts)
620
 
621
  if i < len(questions_data) - 1:
622
  print("Waiting 2 seconds before next question...")
 
636
 
637
  if not answers_payload:
638
  print("Agent did not produce any answers to submit.")
639
+ return "Agent did not produce any answers to submit.", pd.DataFrame(results_log), pd.DataFrame(attempts_log)
640
 
641
  # 4. Prepare Submission
642
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
658
  )
659
  print("Submission successful.")
660
  results_df = pd.DataFrame(results_log)
661
+ attempts_df = pd.DataFrame(attempts_log)
662
+ return final_status, results_df, attempts_df
663
  except requests.exceptions.HTTPError as e:
664
  error_detail = f"Server responded with status {e.response.status_code}."
665
  try:
 
670
  status_message = f"Submission Failed: {error_detail}"
671
  print(status_message)
672
  results_df = pd.DataFrame(results_log)
673
+ attempts_df = pd.DataFrame(attempts_log)
674
+ return status_message, results_df, attempts_df
675
  except requests.exceptions.Timeout:
676
  status_message = "Submission Failed: The request timed out."
677
  print(status_message)
678
  results_df = pd.DataFrame(results_log)
679
+ attempts_df = pd.DataFrame(attempts_log)
680
+ return status_message, results_df, attempts_df
681
  except requests.exceptions.RequestException as e:
682
  status_message = f"Submission Failed: Network error - {e}"
683
  print(status_message)
684
  results_df = pd.DataFrame(results_log)
685
+ attempts_df = pd.DataFrame(attempts_log)
686
+ return status_message, results_df, attempts_df
687
  except Exception as e:
688
  status_message = f"An unexpected error occurred during submission: {e}"
689
  print(status_message)
690
  results_df = pd.DataFrame(results_log)
691
+ attempts_df = pd.DataFrame(attempts_log)
692
+ return status_message, results_df, attempts_df
693
 
694
def test_single_question(question: str, retries: int = 1) -> str:
    """Run the agent on a single question (no submission).

    There is no task_id context here, so no gold answer exists and no real
    judging happens; when `retries` > 0 we perform heuristic reflective
    retries using a generic feedback note.

    Args:
        question: The question to answer.
        retries: Number of reflective re-attempts after the first answer.

    Returns:
        The format-fixed final answer, or an "Error: ..." string on failure.
    """
    try:
        agent = GAIAAgent(verbose=True)
        # Warm the judge singleton so later judged runs reuse this model.
        # (The previous version bound this to an unused local; it also kept
        # an unused `gold = None` — both removed.)
        _ensure_judge(agent.agent.model)

        # First attempt.
        raw = agent(question)
        ans = extract_final_answer(raw)
        fixed = quick_format_fix(ans, question) or ans

        if retries <= 0:
            return fixed

        # Without gold we can't score correctness; do generic reflective retries.
        last = fixed
        note = "Possible format/content mismatch; re-evaluate."
        for attempt in range(retries):
            refined_raw = agent.refine(
                question, prev_answer=last, judge_feedback=note, attempt_no=attempt + 1
            )
            refined = extract_final_answer(refined_raw)
            last = quick_format_fix(refined, question) or refined
        return last
    except Exception as e:
        return f"Error: {e}"
719
 
 
722
  # try task_id lookup first
723
  gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
724
  agent = GAIAAgent(verbose=False)
725
+ judge_agent = _ensure_judge(agent.agent.model)
726
+ res = judge_agent.judge(question, predicted, gold)
 
 
727
  out = {
728
  "Gold": gold,
729
  "is_correct": res["is_correct"],
 
734
 
735
  # --- Build Gradio Interface using Blocks ---
736
  with gr.Blocks() as demo:
737
+ gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge (with smart retries)")
738
  gr.Markdown(
739
  """
740
  ## Instructions:
 
744
  3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
745
 
746
  This agent runs locally, uses an LLM judge against your answers.csv (if present),
747
+ **retries intelligently** when the judge says 'incorrect', and then submits answers to the server.
748
  """
749
  )
750
 
 
752
 
753
  with gr.Tab("Test Single Question"):
754
  test_input = gr.Textbox(label="Enter a question to test", lines=3)
755
+ test_retries = gr.Slider(minimum=0, maximum=3, value=1, step=1, label="Retries (no gold here, heuristic only)")
756
  test_output = gr.Textbox(label="Answer", lines=3)
757
  test_button = gr.Button("Test Question")
758
 
759
  test_button.click(
760
  fn=test_single_question,
761
+ inputs=[test_input, test_retries],
762
  outputs=test_output
763
  )
764
 
 
779
  label="Sample Size (0 for all questions)",
780
  info="Set a number to limit how many questions to process (reduces costs)"
781
  )
782
+ max_retries = gr.Slider(
783
+ minimum=0,
784
+ maximum=3,
785
+ value=1,
786
+ step=1,
787
+ label="Max judge-driven retries per question",
788
+ info="0 = no retries; 1-3 = progressively more effort"
789
+ )
790
+ use_local = gr.Checkbox(
791
+ value=True,
792
+ label="Use local judge (gold) to pick best attempt when available",
793
+ info="If unchecked, we submit the last attempt instead."
794
+ )
795
 
796
+ run_button = gr.Button("Run Evaluation, Judge Locally, Retry & Submit")
797
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
798
+ results_table = gr.DataFrame(label="Final Results (per question)", wrap=True)
799
+ attempts_table = gr.DataFrame(label="Attempt Log (expanded)", wrap=True)
800
 
801
  run_button.click(
802
  fn=run_and_submit_all,
803
+ inputs=[sample_size, max_retries, use_local],
804
+ outputs=[status_output, results_table, attempts_table]
805
  )
806
 
807
  if __name__ == "__main__":