sajjadpsavoji commited on
Commit ·
e05ac72
1
Parent(s): f79b0ac
update
Browse files
app.py
CHANGED
|
@@ -48,7 +48,7 @@ def extract_final_answer(text: str) -> str:
|
|
| 48 |
matches = FINAL_ANSWER_RE.findall(text)
|
| 49 |
if matches:
|
| 50 |
return matches[-1].strip()
|
| 51 |
-
return text.strip()
|
| 52 |
|
| 53 |
def is_number(s: str) -> bool:
|
| 54 |
try:
|
|
@@ -106,6 +106,56 @@ def fast_heuristic_match(pred: str, gold: str) -> bool:
|
|
| 106 |
return True
|
| 107 |
return False
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
# --- Gold Answers Loader ---
|
| 110 |
class GoldAnswers:
|
| 111 |
"""
|
|
@@ -182,14 +232,12 @@ class JudgeAgent:
|
|
| 182 |
direct use of model.generate signatures — this mirrors the GAIA agent path.
|
| 183 |
"""
|
| 184 |
def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
|
| 185 |
-
# Reuse the exact same OpenAIServerModel instance (base_model)
|
| 186 |
self.verbose = verbose
|
| 187 |
-
# No tools required for judging; keep it simple
|
| 188 |
self.agent = CodeAgent(
|
| 189 |
tools=[],
|
| 190 |
model=base_model,
|
| 191 |
-
add_base_tools=False,
|
| 192 |
-
planning_interval=0,
|
| 193 |
verbosity_level=2 if verbose else 0,
|
| 194 |
additional_authorized_imports=[]
|
| 195 |
)
|
|
@@ -199,14 +247,10 @@ class JudgeAgent:
|
|
| 199 |
if fast_heuristic_match(predicted, gold):
|
| 200 |
return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
|
| 201 |
|
| 202 |
-
# Build a single prompt that includes the system guidance and the user content.
|
| 203 |
-
# With CodeAgent, we put the system message at the top of the prompt text.
|
| 204 |
prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
|
| 205 |
-
|
| 206 |
try:
|
| 207 |
-
raw = self.agent.run(prompt)
|
| 208 |
text = (raw or "").strip()
|
| 209 |
-
# Extract the JSON object
|
| 210 |
m = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
| 211 |
payload = json.loads(m.group(0) if m else text)
|
| 212 |
|
|
@@ -215,7 +259,6 @@ class JudgeAgent:
|
|
| 215 |
justification = str(payload.get("justification", "")).strip()[:300]
|
| 216 |
|
| 217 |
return {"is_correct": is_correct, "score": score, "justification": justification}
|
| 218 |
-
|
| 219 |
except Exception as e:
|
| 220 |
return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
|
| 221 |
|
|
@@ -276,28 +319,26 @@ class GAIAAgent:
|
|
| 276 |
"sdrawkcab" in text
|
| 277 |
)
|
| 278 |
|
| 279 |
-
def
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
{question[::-1]}
|
| 289 |
-
|
| 290 |
-
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
| 291 |
|
| 292 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 293 |
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
| 294 |
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 295 |
-
- If you are asked for a comma separated list, apply the above rules depending
|
|
|
|
|
|
|
| 296 |
|
| 297 |
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
| 298 |
- Use web search sparingly and only when absolutely necessary.
|
| 299 |
-
-
|
| 300 |
-
- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
|
| 301 |
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
|
| 302 |
- Focus on answering directly with what you already know when possible.
|
| 303 |
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
|
|
@@ -305,21 +346,29 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
|
| 305 |
|
| 306 |
Remember to structure your response in Python code format using the final_answer() function.
|
| 307 |
"""
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 313 |
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
| 314 |
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 315 |
-
- If you are asked for a comma separated list, apply the above rules depending
|
| 316 |
-
|
| 317 |
-
Question: {question}
|
| 318 |
|
| 319 |
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
| 320 |
- Use web search sparingly and only when absolutely necessary.
|
| 321 |
-
-
|
| 322 |
-
- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
|
| 323 |
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
|
| 324 |
- Focus on answering directly with what you already know when possible.
|
| 325 |
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
|
|
@@ -327,6 +376,17 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
|
| 327 |
|
| 328 |
Remember to structure your response in Python code format using the final_answer() function.
|
| 329 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
try:
|
| 331 |
answer = self.agent.run(prompt)
|
| 332 |
if self.verbose:
|
|
@@ -338,15 +398,55 @@ Remember to structure your response in Python code format using the final_answer
|
|
| 338 |
print(error_msg)
|
| 339 |
return error_msg
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
# --- Singletons for judge/gold ---
|
| 342 |
gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
|
| 343 |
_judge_agent_singleton: Optional[JudgeAgent] = None
|
| 344 |
|
| 345 |
# --- Runner & Submitter (with judge integration) ---
|
| 346 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
"""
|
| 348 |
Fetches all questions, runs the agent on them, judges locally (if gold available),
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
| 350 |
"""
|
| 351 |
username = "Gralon"
|
| 352 |
print(f"Using username: {username}")
|
|
@@ -360,19 +460,14 @@ def run_and_submit_all(sample_size: int = 0):
|
|
| 360 |
agent = GAIAAgent(verbose=True)
|
| 361 |
except Exception as e:
|
| 362 |
print(f"Error instantiating agent: {e}")
|
| 363 |
-
return f"Error initializing agent: {e}", None
|
| 364 |
|
| 365 |
# 1b. Init JudgeAgent once, reusing the SAME model instance
|
| 366 |
-
|
| 367 |
-
if _judge_agent_singleton is None:
|
| 368 |
-
_judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
|
| 369 |
|
| 370 |
# Derive code URL for submission
|
| 371 |
space_id = os.getenv("SPACE_ID")
|
| 372 |
-
if space_id
|
| 373 |
-
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 374 |
-
else:
|
| 375 |
-
agent_code = "local"
|
| 376 |
|
| 377 |
# 2. Fetch Questions
|
| 378 |
print(f"Fetching questions from: {questions_url}")
|
|
@@ -382,62 +477,146 @@ def run_and_submit_all(sample_size: int = 0):
|
|
| 382 |
questions_data = response.json()
|
| 383 |
if not questions_data:
|
| 384 |
print("Fetched questions list is empty.")
|
| 385 |
-
return "Fetched questions list is empty or invalid format.", None
|
| 386 |
print(f"Fetched {len(questions_data)} questions.")
|
| 387 |
except requests.exceptions.RequestException as e:
|
| 388 |
print(f"Error fetching questions: {e}")
|
| 389 |
-
return f"Error fetching questions: {e}", None
|
| 390 |
except json.JSONDecodeError as e:
|
| 391 |
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 392 |
print(f"Response text: {response.text[:500]}")
|
| 393 |
-
return f"Error decoding server response for questions: {e}", None
|
| 394 |
except Exception as e:
|
| 395 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 396 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
| 397 |
-
|
| 398 |
-
# 3. Run Agent + Judge
|
| 399 |
-
results_log = []
|
| 400 |
-
answers_payload = []
|
| 401 |
|
|
|
|
| 402 |
if sample_size > 0 and sample_size < len(questions_data):
|
| 403 |
import random
|
| 404 |
print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
|
| 405 |
questions_data = random.sample(questions_data, sample_size)
|
| 406 |
|
| 407 |
print(f"Running agent on {len(questions_data)} questions...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
for i, item in enumerate(questions_data):
|
| 409 |
task_id = item.get("task_id")
|
| 410 |
question_text = item.get("question")
|
| 411 |
if not task_id or question_text is None:
|
| 412 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 413 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
try:
|
| 415 |
print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
| 424 |
if gold:
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
results_log.append({
|
| 432 |
"Task ID": task_id,
|
| 433 |
"Question": question_text,
|
| 434 |
-
"Submitted Answer":
|
| 435 |
"Gold (local)": gold if gold else "",
|
| 436 |
-
"Judge Correct?":
|
| 437 |
-
"Judge Score":
|
| 438 |
-
"Judge Note":
|
| 439 |
})
|
| 440 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
if i < len(questions_data) - 1:
|
| 443 |
print("Waiting 2 seconds before next question...")
|
|
@@ -457,7 +636,7 @@ def run_and_submit_all(sample_size: int = 0):
|
|
| 457 |
|
| 458 |
if not answers_payload:
|
| 459 |
print("Agent did not produce any answers to submit.")
|
| 460 |
-
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 461 |
|
| 462 |
# 4. Prepare Submission
|
| 463 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
|
@@ -479,7 +658,8 @@ def run_and_submit_all(sample_size: int = 0):
|
|
| 479 |
)
|
| 480 |
print("Submission successful.")
|
| 481 |
results_df = pd.DataFrame(results_log)
|
| 482 |
-
|
|
|
|
| 483 |
except requests.exceptions.HTTPError as e:
|
| 484 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 485 |
try:
|
|
@@ -490,29 +670,50 @@ def run_and_submit_all(sample_size: int = 0):
|
|
| 490 |
status_message = f"Submission Failed: {error_detail}"
|
| 491 |
print(status_message)
|
| 492 |
results_df = pd.DataFrame(results_log)
|
| 493 |
-
|
|
|
|
| 494 |
except requests.exceptions.Timeout:
|
| 495 |
status_message = "Submission Failed: The request timed out."
|
| 496 |
print(status_message)
|
| 497 |
results_df = pd.DataFrame(results_log)
|
| 498 |
-
|
|
|
|
| 499 |
except requests.exceptions.RequestException as e:
|
| 500 |
status_message = f"Submission Failed: Network error - {e}"
|
| 501 |
print(status_message)
|
| 502 |
results_df = pd.DataFrame(results_log)
|
| 503 |
-
|
|
|
|
| 504 |
except Exception as e:
|
| 505 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 506 |
print(status_message)
|
| 507 |
results_df = pd.DataFrame(results_log)
|
| 508 |
-
|
|
|
|
| 509 |
|
| 510 |
-
def test_single_question(question: str) -> str:
|
| 511 |
-
"""Test the agent on a single question (no submission)."""
|
| 512 |
try:
|
| 513 |
agent = GAIAAgent(verbose=True)
|
| 514 |
-
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
except Exception as e:
|
| 517 |
return f"Error: {e}"
|
| 518 |
|
|
@@ -521,10 +722,8 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
|
|
| 521 |
# try task_id lookup first
|
| 522 |
gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
|
| 523 |
agent = GAIAAgent(verbose=False)
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
_judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
|
| 527 |
-
res = _judge_agent_singleton.judge(question, predicted, gold)
|
| 528 |
out = {
|
| 529 |
"Gold": gold,
|
| 530 |
"is_correct": res["is_correct"],
|
|
@@ -535,7 +734,7 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
|
|
| 535 |
|
| 536 |
# --- Build Gradio Interface using Blocks ---
|
| 537 |
with gr.Blocks() as demo:
|
| 538 |
-
gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge")
|
| 539 |
gr.Markdown(
|
| 540 |
"""
|
| 541 |
## Instructions:
|
|
@@ -545,7 +744,7 @@ with gr.Blocks() as demo:
|
|
| 545 |
3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
|
| 546 |
|
| 547 |
This agent runs locally, uses an LLM judge against your answers.csv (if present),
|
| 548 |
-
and then submits answers to the server.
|
| 549 |
"""
|
| 550 |
)
|
| 551 |
|
|
@@ -553,12 +752,13 @@ with gr.Blocks() as demo:
|
|
| 553 |
|
| 554 |
with gr.Tab("Test Single Question"):
|
| 555 |
test_input = gr.Textbox(label="Enter a question to test", lines=3)
|
|
|
|
| 556 |
test_output = gr.Textbox(label="Answer", lines=3)
|
| 557 |
test_button = gr.Button("Test Question")
|
| 558 |
|
| 559 |
test_button.click(
|
| 560 |
fn=test_single_question,
|
| 561 |
-
inputs=test_input,
|
| 562 |
outputs=test_output
|
| 563 |
)
|
| 564 |
|
|
@@ -579,15 +779,29 @@ with gr.Blocks() as demo:
|
|
| 579 |
label="Sample Size (0 for all questions)",
|
| 580 |
info="Set a number to limit how many questions to process (reduces costs)"
|
| 581 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
|
| 583 |
-
run_button = gr.Button("Run Evaluation, Judge Locally & Submit")
|
| 584 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 585 |
-
results_table = gr.DataFrame(label="
|
|
|
|
| 586 |
|
| 587 |
run_button.click(
|
| 588 |
fn=run_and_submit_all,
|
| 589 |
-
inputs=sample_size,
|
| 590 |
-
outputs=[status_output, results_table]
|
| 591 |
)
|
| 592 |
|
| 593 |
if __name__ == "__main__":
|
|
|
|
| 48 |
matches = FINAL_ANSWER_RE.findall(text)
|
| 49 |
if matches:
|
| 50 |
return matches[-1].strip()
|
| 51 |
+
return (text or "").strip()
|
| 52 |
|
| 53 |
def is_number(s: str) -> bool:
|
| 54 |
try:
|
|
|
|
| 106 |
return True
|
| 107 |
return False
|
| 108 |
|
| 109 |
+
def quick_format_fix(answer: str, question: str) -> str:
|
| 110 |
+
"""
|
| 111 |
+
Deterministic, judge-friendly cleanup. We DO NOT use gold here.
|
| 112 |
+
- Remove leading articles for strings
|
| 113 |
+
- Strip currency & percent unless explicitly requested by question
|
| 114 |
+
- Remove thousands commas in numbers
|
| 115 |
+
- Trim trailing punctuation
|
| 116 |
+
- Normalize whitespace
|
| 117 |
+
- Unify separators to comma for list-like strings
|
| 118 |
+
"""
|
| 119 |
+
if not isinstance(answer, str):
|
| 120 |
+
return answer
|
| 121 |
+
|
| 122 |
+
s = answer.strip()
|
| 123 |
+
|
| 124 |
+
# remove code fences around final answer if any
|
| 125 |
+
s = re.sub(r"^```.*?\n", "", s, flags=re.DOTALL)
|
| 126 |
+
s = s.replace("```", "").strip()
|
| 127 |
+
|
| 128 |
+
# normalize whitespace
|
| 129 |
+
s = re.sub(r"\s+", " ", s).strip()
|
| 130 |
+
|
| 131 |
+
# drop trailing period if looks like a sentence end
|
| 132 |
+
s = re.sub(r"[.。]+$", "", s)
|
| 133 |
+
|
| 134 |
+
# if list-like but uses semicolons or slashes, convert to commas
|
| 135 |
+
if ";" in s or "/" in s:
|
| 136 |
+
s = re.sub(r"[;/]+", ",", s)
|
| 137 |
+
s = re.sub(r"\s*,\s*", ", ", s) # pretty spacing
|
| 138 |
+
|
| 139 |
+
# remove leading articles for string-y answers
|
| 140 |
+
s = re.sub(r"^(?i)(a|an|the)\s+", "", s)
|
| 141 |
+
|
| 142 |
+
# remove thousands commas in numbers like 1,234 -> 1234 (but keep commas that separate lists)
|
| 143 |
+
# crude heuristic: if the whole answer is numeric-with-commas and no other commas
|
| 144 |
+
if "," in s and not re.search(r".*,.*", s): # only one comma group
|
| 145 |
+
if re.fullmatch(r"\d{1,3}(,\d{3})+(\.\d+)?", s):
|
| 146 |
+
s = s.replace(",", "")
|
| 147 |
+
|
| 148 |
+
# remove currency unless explicitly requested
|
| 149 |
+
if "$" in s and not re.search(r"(?i)\b(dollar|usd|\$)\b.*(include|keep|use)|include\s*\$", question):
|
| 150 |
+
s = s.replace("$", "")
|
| 151 |
+
|
| 152 |
+
# percent sign rules: keep only if question appears to require it explicitly
|
| 153 |
+
needs_percent = bool(re.search(r"(?i)\b(percent|%)\b.*(include|with|as sign)|include\s*%", question))
|
| 154 |
+
if "%" in s and not needs_percent:
|
| 155 |
+
s = s.replace("%", "")
|
| 156 |
+
|
| 157 |
+
return s.strip()
|
| 158 |
+
|
| 159 |
# --- Gold Answers Loader ---
|
| 160 |
class GoldAnswers:
|
| 161 |
"""
|
|
|
|
| 232 |
direct use of model.generate signatures — this mirrors the GAIA agent path.
|
| 233 |
"""
|
| 234 |
def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
|
|
|
|
| 235 |
self.verbose = verbose
|
|
|
|
| 236 |
self.agent = CodeAgent(
|
| 237 |
tools=[],
|
| 238 |
model=base_model,
|
| 239 |
+
add_base_tools=False,
|
| 240 |
+
planning_interval=0,
|
| 241 |
verbosity_level=2 if verbose else 0,
|
| 242 |
additional_authorized_imports=[]
|
| 243 |
)
|
|
|
|
| 247 |
if fast_heuristic_match(predicted, gold):
|
| 248 |
return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
|
| 249 |
|
|
|
|
|
|
|
| 250 |
prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
|
|
|
|
| 251 |
try:
|
| 252 |
+
raw = self.agent.run(prompt)
|
| 253 |
text = (raw or "").strip()
|
|
|
|
| 254 |
m = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
| 255 |
payload = json.loads(m.group(0) if m else text)
|
| 256 |
|
|
|
|
| 259 |
justification = str(payload.get("justification", "")).strip()[:300]
|
| 260 |
|
| 261 |
return {"is_correct": is_correct, "score": score, "justification": justification}
|
|
|
|
| 262 |
except Exception as e:
|
| 263 |
return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
|
| 264 |
|
|
|
|
| 319 |
"sdrawkcab" in text
|
| 320 |
)
|
| 321 |
|
| 322 |
+
def _base_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
|
| 323 |
+
# Let retries slightly relax the search budget
|
| 324 |
+
search_budget_line = (
|
| 325 |
+
"- Limit to 1-2 web searches per question.\n"
|
| 326 |
+
if not allow_extra_searches else
|
| 327 |
+
"- You may use up to 3-4 web searches if needed.\n"
|
| 328 |
+
)
|
| 329 |
+
return f"""
|
| 330 |
+
You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 333 |
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
| 334 |
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 335 |
+
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
| 336 |
+
|
| 337 |
+
Question: {question}
|
| 338 |
|
| 339 |
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
| 340 |
- Use web search sparingly and only when absolutely necessary.
|
| 341 |
+
{search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
|
|
|
|
| 342 |
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
|
| 343 |
- Focus on answering directly with what you already know when possible.
|
| 344 |
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
|
|
|
|
| 346 |
|
| 347 |
Remember to structure your response in Python code format using the final_answer() function.
|
| 348 |
"""
|
| 349 |
+
|
| 350 |
+
def _reversed_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
|
| 351 |
+
search_budget_line = (
|
| 352 |
+
"- Limit to 1-2 web searches per question.\n"
|
| 353 |
+
if not allow_extra_searches else
|
| 354 |
+
"- You may use up to 3-4 web searches if needed.\n"
|
| 355 |
+
)
|
| 356 |
+
return f"""
|
| 357 |
+
You are a general AI assistant. I will ask you a question.
|
| 358 |
+
|
| 359 |
+
This question appears to be in reversed text. Here is the reversed version for clarity:
|
| 360 |
+
{question[::-1]}
|
| 361 |
+
|
| 362 |
+
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
| 363 |
|
| 364 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 365 |
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
| 366 |
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 367 |
+
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
|
|
|
|
|
|
| 368 |
|
| 369 |
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
|
| 370 |
- Use web search sparingly and only when absolutely necessary.
|
| 371 |
+
{search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
|
|
|
|
| 372 |
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
|
| 373 |
- Focus on answering directly with what you already know when possible.
|
| 374 |
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
|
|
|
|
| 376 |
|
| 377 |
Remember to structure your response in Python code format using the final_answer() function.
|
| 378 |
"""
|
| 379 |
+
|
| 380 |
+
def __call__(self, question: str, allow_extra_searches: bool = False) -> str:
|
| 381 |
+
if self.verbose:
|
| 382 |
+
msg = f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}"
|
| 383 |
+
print(msg)
|
| 384 |
+
|
| 385 |
+
prompt = (
|
| 386 |
+
self._reversed_prompt(question, allow_extra_searches)
|
| 387 |
+
if self._is_reversed_text(question)
|
| 388 |
+
else self._base_prompt(question, allow_extra_searches)
|
| 389 |
+
)
|
| 390 |
try:
|
| 391 |
answer = self.agent.run(prompt)
|
| 392 |
if self.verbose:
|
|
|
|
| 398 |
print(error_msg)
|
| 399 |
return error_msg
|
| 400 |
|
| 401 |
+
def refine(self, question: str, prev_answer: str, judge_feedback: str, attempt_no: int) -> str:
|
| 402 |
+
"""
|
| 403 |
+
Reflection-based reattempt without using gold.
|
| 404 |
+
"""
|
| 405 |
+
if self.verbose:
|
| 406 |
+
print(f"Refining (attempt {attempt_no}) based on judge note: {judge_feedback}")
|
| 407 |
+
|
| 408 |
+
allow_extra = attempt_no >= 2 # relax search budget after first retry
|
| 409 |
+
base = self._base_prompt(question, allow_extra_searches=allow_extra)
|
| 410 |
+
|
| 411 |
+
refinement_addendum = f"""
|
| 412 |
+
Your previous FINAL ANSWER was:
|
| 413 |
+
{prev_answer}
|
| 414 |
+
|
| 415 |
+
A strict judge said this answer was incorrect for the following reason(s) (be concise): {judge_feedback}
|
| 416 |
+
|
| 417 |
+
Re-evaluate the question carefully. Consider possible formatting issues (units, articles, thousands commas), list ordering (only if the question requires a specific order), and rounding.
|
| 418 |
+
Produce a NEW final answer. Do not repeat the previous final answer if you think it was wrong.
|
| 419 |
+
"""
|
| 420 |
+
|
| 421 |
+
try:
|
| 422 |
+
answer = self.agent.run(base + refinement_addendum)
|
| 423 |
+
if self.verbose:
|
| 424 |
+
print(f"Refined answer: {answer}")
|
| 425 |
+
return answer
|
| 426 |
+
except Exception as e:
|
| 427 |
+
err = f"Error refining: {e}"
|
| 428 |
+
if self.verbose:
|
| 429 |
+
print(err)
|
| 430 |
+
return err
|
| 431 |
+
|
| 432 |
# --- Singletons for judge/gold ---
|
| 433 |
gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
|
| 434 |
_judge_agent_singleton: Optional[JudgeAgent] = None
|
| 435 |
|
| 436 |
# --- Runner & Submitter (with judge integration) ---
|
| 437 |
+
def _ensure_judge(model: OpenAIServerModel) -> JudgeAgent:
|
| 438 |
+
global _judge_agent_singleton
|
| 439 |
+
if _judge_agent_singleton is None:
|
| 440 |
+
_judge_agent_singleton = JudgeAgent(base_model=model, verbose=False)
|
| 441 |
+
return _judge_agent_singleton
|
| 442 |
+
|
| 443 |
+
def run_and_submit_all(sample_size: int = 0, max_retries: int = 1, use_local_judge_to_select: bool = True):
|
| 444 |
"""
|
| 445 |
Fetches all questions, runs the agent on them, judges locally (if gold available),
|
| 446 |
+
optionally reattempts on incorrect results, submits answers, and returns:
|
| 447 |
+
- final status string
|
| 448 |
+
- final results dataframe (one row per question)
|
| 449 |
+
- attempt log dataframe (one row per attempt)
|
| 450 |
"""
|
| 451 |
username = "Gralon"
|
| 452 |
print(f"Using username: {username}")
|
|
|
|
| 460 |
agent = GAIAAgent(verbose=True)
|
| 461 |
except Exception as e:
|
| 462 |
print(f"Error instantiating agent: {e}")
|
| 463 |
+
return f"Error initializing agent: {e}", None, None
|
| 464 |
|
| 465 |
# 1b. Init JudgeAgent once, reusing the SAME model instance
|
| 466 |
+
judge_agent = _ensure_judge(agent.agent.model)
|
|
|
|
|
|
|
| 467 |
|
| 468 |
# Derive code URL for submission
|
| 469 |
space_id = os.getenv("SPACE_ID")
|
| 470 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
# 2. Fetch Questions
|
| 473 |
print(f"Fetching questions from: {questions_url}")
|
|
|
|
| 477 |
questions_data = response.json()
|
| 478 |
if not questions_data:
|
| 479 |
print("Fetched questions list is empty.")
|
| 480 |
+
return "Fetched questions list is empty or invalid format.", None, None
|
| 481 |
print(f"Fetched {len(questions_data)} questions.")
|
| 482 |
except requests.exceptions.RequestException as e:
|
| 483 |
print(f"Error fetching questions: {e}")
|
| 484 |
+
return f"Error fetching questions: {e}", None, None
|
| 485 |
except json.JSONDecodeError as e:
|
| 486 |
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 487 |
print(f"Response text: {response.text[:500]}")
|
| 488 |
+
return f"Error decoding server response for questions: {e}", None, None
|
| 489 |
except Exception as e:
|
| 490 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 491 |
+
return f"An unexpected error occurred fetching questions: {e}", None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
+
# Sampling
|
| 494 |
if sample_size > 0 and sample_size < len(questions_data):
|
| 495 |
import random
|
| 496 |
print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
|
| 497 |
questions_data = random.sample(questions_data, sample_size)
|
| 498 |
|
| 499 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 500 |
+
results_log: List[Dict[str, Any]] = []
|
| 501 |
+
attempts_log: List[Dict[str, Any]] = []
|
| 502 |
+
answers_payload: List[Dict[str, Any]] = []
|
| 503 |
+
|
| 504 |
for i, item in enumerate(questions_data):
|
| 505 |
task_id = item.get("task_id")
|
| 506 |
question_text = item.get("question")
|
| 507 |
if not task_id or question_text is None:
|
| 508 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 509 |
continue
|
| 510 |
+
|
| 511 |
+
gold = gold_answers.by_task_id.get(task_id)
|
| 512 |
+
per_question_attempts: List[Dict[str, Any]] = []
|
| 513 |
+
|
| 514 |
try:
|
| 515 |
print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
|
| 516 |
+
|
| 517 |
+
# -- First attempt
|
| 518 |
+
raw = agent(question_text, allow_extra_searches=False)
|
| 519 |
+
ans = extract_final_answer(raw)
|
| 520 |
+
fixed = quick_format_fix(ans, question_text) or ans
|
| 521 |
+
|
| 522 |
+
# judge first (on fixed)
|
| 523 |
+
jres = None
|
| 524 |
+
j_is_correct = None
|
| 525 |
+
j_score = None
|
| 526 |
+
j_note = None
|
| 527 |
if gold:
|
| 528 |
+
jres = judge_agent.judge(question_text, fixed, gold)
|
| 529 |
+
j_is_correct = jres.get("is_correct")
|
| 530 |
+
j_score = jres.get("score")
|
| 531 |
+
j_note = jres.get("justification")
|
| 532 |
+
|
| 533 |
+
per_question_attempts.append({
|
| 534 |
+
"Task ID": task_id,
|
| 535 |
+
"Attempt": 1,
|
| 536 |
+
"Submitted Answer (raw)": ans,
|
| 537 |
+
"Submitted Answer (fixed)": fixed,
|
| 538 |
+
"Judge Correct?": j_is_correct,
|
| 539 |
+
"Judge Score": j_score,
|
| 540 |
+
"Judge Note": j_note
|
| 541 |
+
})
|
| 542 |
|
| 543 |
+
best_answer = fixed
|
| 544 |
+
best_score = j_score if j_score is not None else 0.0
|
| 545 |
+
best_correct = j_is_correct
|
| 546 |
+
|
| 547 |
+
retries = 0
|
| 548 |
+
while (j_is_correct is False) and (retries < max_retries):
|
| 549 |
+
retries += 1
|
| 550 |
+
|
| 551 |
+
# Try reflective retry
|
| 552 |
+
refined_raw = agent.refine(
|
| 553 |
+
question=question_text,
|
| 554 |
+
prev_answer=fixed,
|
| 555 |
+
judge_feedback=j_note or "Format/content mismatch.",
|
| 556 |
+
attempt_no=retries
|
| 557 |
+
)
|
| 558 |
+
refined = extract_final_answer(refined_raw)
|
| 559 |
+
refined_fixed = quick_format_fix(refined, question_text) or refined
|
| 560 |
+
|
| 561 |
+
# Judge the refined answer
|
| 562 |
+
j2 = None
|
| 563 |
+
j2_is_correct = None
|
| 564 |
+
j2_score = None
|
| 565 |
+
j2_note = None
|
| 566 |
+
if gold:
|
| 567 |
+
j2 = judge_agent.judge(question_text, refined_fixed, gold)
|
| 568 |
+
j2_is_correct = j2.get("is_correct")
|
| 569 |
+
j2_score = j2.get("score")
|
| 570 |
+
j2_note = j2.get("justification")
|
| 571 |
+
|
| 572 |
+
per_question_attempts.append({
|
| 573 |
+
"Task ID": task_id,
|
| 574 |
+
"Attempt": retries + 1,
|
| 575 |
+
"Submitted Answer (raw)": refined,
|
| 576 |
+
"Submitted Answer (fixed)": refined_fixed,
|
| 577 |
+
"Judge Correct?": j2_is_correct,
|
| 578 |
+
"Judge Score": j2_score,
|
| 579 |
+
"Judge Note": j2_note
|
| 580 |
+
})
|
| 581 |
+
|
| 582 |
+
# Decide whether to keep this as best
|
| 583 |
+
if use_local_judge_to_select and gold and (j2_score is not None):
|
| 584 |
+
if (j2_score > (best_score or 0)) or (best_score is None):
|
| 585 |
+
best_answer, best_score, best_correct = refined_fixed, j2_score, j2_is_correct
|
| 586 |
+
else:
|
| 587 |
+
# If we don't have gold/judge, prefer the newest answer
|
| 588 |
+
best_answer = refined_fixed
|
| 589 |
+
best_score = j2_score if j2_score is not None else best_score
|
| 590 |
+
best_correct = j2_is_correct if j2_is_correct is not None else best_correct
|
| 591 |
+
|
| 592 |
+
# Prepare for another retry if needed
|
| 593 |
+
fixed = refined_fixed
|
| 594 |
+
j_is_correct = j2_is_correct
|
| 595 |
+
j_score = j2_score
|
| 596 |
+
j_note = j2_note
|
| 597 |
+
|
| 598 |
+
if j2_is_correct:
|
| 599 |
+
break
|
| 600 |
+
|
| 601 |
+
if retries < max_retries:
|
| 602 |
+
print("Waiting 2 seconds before next attempt...")
|
| 603 |
+
time.sleep(2)
|
| 604 |
+
|
| 605 |
+
# Append final choice per question
|
| 606 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": best_answer})
|
| 607 |
results_log.append({
|
| 608 |
"Task ID": task_id,
|
| 609 |
"Question": question_text,
|
| 610 |
+
"Submitted Answer": best_answer,
|
| 611 |
"Gold (local)": gold if gold else "",
|
| 612 |
+
"Judge Correct?": best_correct,
|
| 613 |
+
"Judge Score": best_score,
|
| 614 |
+
"Judge Note": j_note
|
| 615 |
})
|
| 616 |
+
print(f"Finished question {i+1}")
|
| 617 |
+
|
| 618 |
+
# Add to global attempts log
|
| 619 |
+
attempts_log.extend(per_question_attempts)
|
| 620 |
|
| 621 |
if i < len(questions_data) - 1:
|
| 622 |
print("Waiting 2 seconds before next question...")
|
|
|
|
| 636 |
|
| 637 |
if not answers_payload:
|
| 638 |
print("Agent did not produce any answers to submit.")
|
| 639 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log), pd.DataFrame(attempts_log)
|
| 640 |
|
| 641 |
# 4. Prepare Submission
|
| 642 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
|
|
|
| 658 |
)
|
| 659 |
print("Submission successful.")
|
| 660 |
results_df = pd.DataFrame(results_log)
|
| 661 |
+
attempts_df = pd.DataFrame(attempts_log)
|
| 662 |
+
return final_status, results_df, attempts_df
|
| 663 |
except requests.exceptions.HTTPError as e:
|
| 664 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 665 |
try:
|
|
|
|
| 670 |
status_message = f"Submission Failed: {error_detail}"
|
| 671 |
print(status_message)
|
| 672 |
results_df = pd.DataFrame(results_log)
|
| 673 |
+
attempts_df = pd.DataFrame(attempts_log)
|
| 674 |
+
return status_message, results_df, attempts_df
|
| 675 |
except requests.exceptions.Timeout:
|
| 676 |
status_message = "Submission Failed: The request timed out."
|
| 677 |
print(status_message)
|
| 678 |
results_df = pd.DataFrame(results_log)
|
| 679 |
+
attempts_df = pd.DataFrame(attempts_log)
|
| 680 |
+
return status_message, results_df, attempts_df
|
| 681 |
except requests.exceptions.RequestException as e:
|
| 682 |
status_message = f"Submission Failed: Network error - {e}"
|
| 683 |
print(status_message)
|
| 684 |
results_df = pd.DataFrame(results_log)
|
| 685 |
+
attempts_df = pd.DataFrame(attempts_log)
|
| 686 |
+
return status_message, results_df, attempts_df
|
| 687 |
except Exception as e:
|
| 688 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 689 |
print(status_message)
|
| 690 |
results_df = pd.DataFrame(results_log)
|
| 691 |
+
attempts_df = pd.DataFrame(attempts_log)
|
| 692 |
+
return status_message, results_df, attempts_df
|
| 693 |
|
| 694 |
+
def test_single_question(question: str, retries: int = 1) -> str:
    """Run the agent on one ad-hoc question (no submission, no gold answer).

    Produces a first answer, then performs up to ``retries`` reflective
    refinement passes. Because there is no task_id context here, no gold
    answer is available and correctness cannot be judged — the retries are
    purely heuristic re-evaluations.

    Args:
        question: The question text to answer.
        retries: Number of reflective refinement passes (0 disables them).

    Returns:
        The final (possibly refined) answer string, or an ``"Error: ..."``
        message if anything raised.
    """
    try:
        agent = GAIAAgent(verbose=True)
        # NOTE(review): the judge instance itself is unused here (no gold to
        # judge against), but _ensure_judge may lazily create/cache the shared
        # judge, so the call is kept for its side effect — TODO confirm.
        _ensure_judge(agent.agent.model)

        # First attempt: raw generation -> extract FINAL ANSWER -> format fix.
        raw = agent(question)
        ans = extract_final_answer(raw)
        fixed = quick_format_fix(ans, question) or ans

        if retries <= 0:
            return fixed

        # Without gold we can't know correctness; do heuristic reflective
        # retries and return the last refinement.
        last = fixed
        note = "Possible format/content mismatch; re-evaluate."
        for attempt in range(retries):
            refined_raw = agent.refine(
                question,
                prev_answer=last,
                judge_feedback=note,
                attempt_no=attempt + 1,
            )
            refined = extract_final_answer(refined_raw)
            last = quick_format_fix(refined, question) or refined
        return last
    except Exception as e:
        # Surface the failure to the Gradio textbox instead of raising.
        return f"Error: {e}"
|
| 719 |
|
|
|
|
| 722 |
# try task_id lookup first
|
| 723 |
gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
|
| 724 |
agent = GAIAAgent(verbose=False)
|
| 725 |
+
judge_agent = _ensure_judge(agent.agent.model)
|
| 726 |
+
res = judge_agent.judge(question, predicted, gold)
|
|
|
|
|
|
|
| 727 |
out = {
|
| 728 |
"Gold": gold,
|
| 729 |
"is_correct": res["is_correct"],
|
|
|
|
| 734 |
|
| 735 |
# --- Build Gradio Interface using Blocks ---
|
| 736 |
with gr.Blocks() as demo:
|
| 737 |
+
gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge (with smart retries)")
|
| 738 |
gr.Markdown(
|
| 739 |
"""
|
| 740 |
## Instructions:
|
|
|
|
| 744 |
3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
|
| 745 |
|
| 746 |
This agent runs locally, uses an LLM judge against your answers.csv (if present),
|
| 747 |
+
**retries intelligently** when the judge says 'incorrect', and then submits answers to the server.
|
| 748 |
"""
|
| 749 |
)
|
| 750 |
|
|
|
|
| 752 |
|
| 753 |
with gr.Tab("Test Single Question"):
|
| 754 |
test_input = gr.Textbox(label="Enter a question to test", lines=3)
|
| 755 |
+
test_retries = gr.Slider(minimum=0, maximum=3, value=1, step=1, label="Retries (no gold here, heuristic only)")
|
| 756 |
test_output = gr.Textbox(label="Answer", lines=3)
|
| 757 |
test_button = gr.Button("Test Question")
|
| 758 |
|
| 759 |
test_button.click(
|
| 760 |
fn=test_single_question,
|
| 761 |
+
inputs=[test_input, test_retries],
|
| 762 |
outputs=test_output
|
| 763 |
)
|
| 764 |
|
|
|
|
| 779 |
label="Sample Size (0 for all questions)",
|
| 780 |
info="Set a number to limit how many questions to process (reduces costs)"
|
| 781 |
)
|
| 782 |
+
max_retries = gr.Slider(
|
| 783 |
+
minimum=0,
|
| 784 |
+
maximum=3,
|
| 785 |
+
value=1,
|
| 786 |
+
step=1,
|
| 787 |
+
label="Max judge-driven retries per question",
|
| 788 |
+
info="0 = no retries; 1-3 = progressively more effort"
|
| 789 |
+
)
|
| 790 |
+
use_local = gr.Checkbox(
|
| 791 |
+
value=True,
|
| 792 |
+
label="Use local judge (gold) to pick best attempt when available",
|
| 793 |
+
info="If unchecked, we submit the last attempt instead."
|
| 794 |
+
)
|
| 795 |
|
| 796 |
+
run_button = gr.Button("Run Evaluation, Judge Locally, Retry & Submit")
|
| 797 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 798 |
+
results_table = gr.DataFrame(label="Final Results (per question)", wrap=True)
|
| 799 |
+
attempts_table = gr.DataFrame(label="Attempt Log (expanded)", wrap=True)
|
| 800 |
|
| 801 |
run_button.click(
|
| 802 |
fn=run_and_submit_all,
|
| 803 |
+
inputs=[sample_size, max_retries, use_local],
|
| 804 |
+
outputs=[status_output, results_table, attempts_table]
|
| 805 |
)
|
| 806 |
|
| 807 |
if __name__ == "__main__":
|