Spaces:
Runtime error
Runtime error
Yago Bolivar committed on
Commit ·
b121170
1
Parent(s): 4d7d7f8
feat: add evaluation and submission utilities for GAIA project
Browse files
utilities/{compare_questions.py → compare_question_set.py}
RENAMED
|
File without changes
|
utilities/evaluate_local.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# TEST WITH
|
| 6 |
+
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
|
| 7 |
+
|
| 8 |
+
def load_json(filepath):
    """Read and parse a JSON document from *filepath*.

    Returns the decoded object on success. On a missing file or invalid
    JSON, prints an error message and returns None instead of raising.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
    return None
|
| 19 |
+
|
| 20 |
+
def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """Score agent answers against the GAIA ground truth.

    Args:
        questions_data (dict): Maps task_id to question details, including
            the ground-truth 'Final Answer' and (optionally) 'Level'.
        agent_answers_data (list): Dicts carrying 'task_id' and
            'submitted_answer'.
        level_filter (int, optional): When given, only questions at this
            GAIA level are scored. Defaults to None (all levels).

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
        where incorrect_details is a list of (task_id, expected, got) tuples.
    """
    submitted_by_task = {
        entry['task_id']: entry['submitted_answer'] for entry in agent_answers_data
    }

    correct_count = 0
    total_evaluated = 0
    incorrect_details = []

    for task_id, info in questions_data.items():
        # Skip questions outside the requested level, if one was given.
        if level_filter is not None and info.get('Level') != level_filter:
            continue
        # Only score questions the agent actually answered.
        if task_id not in submitted_by_task:
            continue

        total_evaluated += 1
        expected = info.get('Final Answer')
        got = submitted_by_task[task_id]

        # GAIA scoring is an exact (stringified) match.
        if str(got) == str(expected):
            correct_count += 1
        else:
            incorrect_details.append((task_id, expected, got))

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details
|
| 58 |
+
|
| 59 |
+
def main():
    """CLI entry point: load the question set and agent answers, then report accuracy."""
    cli = argparse.ArgumentParser(
        description="Evaluate agent answers locally against GAIA ground truth."
    )
    cli.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",  # resolved relative to this script
        help="Path to the JSON file containing GAIA questions and answers.",
    )
    cli.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers.",
    )
    cli.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,  # None means "all levels"
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified.",
    )
    cli.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers.",
    )

    args = cli.parse_args()

    # Resolve paths relative to this script's directory so the tool works
    # regardless of the caller's cwd.
    script_dir = os.path.dirname(__file__)
    questions_path = os.path.abspath(os.path.join(script_dir, args.questions_file))
    # The answers file is assumed to live one level up (repo root) from utilities/.
    answers_path = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))

    print(f"Loading questions from: {questions_path}")
    question_bank = load_json(questions_path)
    if question_bank is None:
        return

    print(f"Loading agent answers from: {answers_path}")
    answers = load_json(answers_path)
    if answers is None:
        return

    if not isinstance(answers, list):
        print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
        # Recover from the common mistake of wrapping the list as {"answers": [...]}.
        wrapped = (
            isinstance(answers, dict)
            and 'answers' in answers
            and isinstance(answers['answers'], list)
        )
        if not wrapped:
            return
        answers = answers['answers']
        print("Note: Loaded answers from the 'answers' key in the JSON object.")

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        question_bank, answers, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"    Expected: {expected}")
                print(f"    Got: {got}")
                print("------------------------")
|
| 138 |
+
|
| 139 |
+
# Script entry point: run the CLI evaluator only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
utilities/evaluate_local_commands.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**Run the Evaluation Script:** Open your terminal, navigate to the `utilities` directory, and run the script:
|
| 2 |
+
|
| 3 |
+
* **Evaluate all levels:**
|
| 4 |
+
```bash
|
| 5 |
+
cd /Users/yagoairm2/Desktop/agents/final\ project/HF_Agents_Final_Project/utilities
|
| 6 |
+
python evaluate_local.py --answers_file ../agent_answers.json
|
| 7 |
+
```
|
| 8 |
+
* **Evaluate only Level 1:**
|
| 9 |
+
```bash
|
| 10 |
+
python evaluate_local.py --answers_file ../agent_answers.json --level 1
|
| 11 |
+
```
|
| 12 |
+
* **Evaluate Level 1 and show incorrect answers:**
|
| 13 |
+
```bash
|
| 14 |
+
python evaluate_local.py --answers_file ../agent_answers.json --level 1 --verbose
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
This script will calculate and print the accuracy based on the exact match criterion used by GAIA, without submitting anything to the official leaderboard.
|
utilities/{random_question_answer.py → random_question_submit.py}
RENAMED
|
File without changes
|