# Spaces: Runtime error
def test_questions():
    """Return the fixed list of sample questions used to evaluate an agent.

    Returns:
        A list of dicts, each with three keys:
        ``task_id`` (str), ``question`` (the prompt text) and
        ``expected_keywords`` (a list of strings; ``evaluate_agent`` counts an
        answer as correct when any one of them occurs in it,
        case-insensitively).
    """
    return [
        {
            "task_id": "q4",
            # NOTE(review): the prompt ends mid-sentence; the expected keywords
            # suggest this is deliberate (agent should ask for the missing
            # part) - confirm.
            "question": (
                "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
                "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
            ),
            "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"],
        },
        {
            "task_id": "q7",
            "question": (
                "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
                "|*|a|b|c|d|e|\n"
                "|---|---|---|---|---|---|\n"
                "|a|a|b|c|b|d|\n"
                "|b|b|c|a|e|c|\n"
                "|c|c|a|b|b|a|\n"
                "|d|b|e|b|e|d|\n"
                "|e|d|b|a|d|c|\n\n"
                "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
                "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
            ),
            # b*e = c but e*b = b is the only asymmetric pair in the table.
            "expected_keywords": ["b, e"],
        },
        {
            "task_id": "q3",
            # The sentence is written backwards on purpose.
            "question": (
                "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"
            ),
            "expected_keywords": ["right"],
        },
        {
            "task_id": "q2",
            "question": (
                "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
                "Use Wikipedia to find the answer."
            ),
            "expected_keywords": ["3", "three"],
        },
        {
            "task_id": "q4b",
            "question": (
                "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
            ),
            "expected_keywords": ["FunkMonk"],
        },
        {
            "task_id": "q5",
            "question": (
                "Who is the CEO of OpenAI?"
            ),
            "expected_keywords": ["sam altman"],
        },
        {
            "task_id": "q6",
            "question": (
                "When was the Eiffel Tower built?"
            ),
            "expected_keywords": ["1889"],
        },
    ]


# The name matches pytest's default ``test_*`` discovery pattern, but this is a
# data provider that returns a value; opt it out of test collection.
test_questions.__test__ = False
def evaluate_agent(agent, questions):
    """Run ``agent`` on every question and report how many answers matched.

    An answer counts as correct when at least one of the question's
    ``expected_keywords`` occurs in it as a case-insensitive substring.
    Progress and per-question verdicts are printed to stdout.

    Args:
        agent: Callable invoked as ``agent(question_text)``. Its return value
            is converted to text before matching, so non-string answers
            (e.g. an ``int``) are graded instead of being reported as errors;
            ``None`` is treated as an empty answer.
        questions: Sized iterable of dicts with the keys ``"question"`` and
            ``"expected_keywords"`` (a list of strings).

    Returns:
        The score summary string, ending in
        ``"Score: <correct> / <total> correct"``.
    """
    # NOTE(review): the non-ASCII prefixes in the messages below look like
    # mis-encoded emoji; they are kept byte-for-byte because the returned
    # report contains one of them - confirm the intended glyphs.
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)
    for q in questions:
        print(f"π¨ Q: {q['question']}")
        # Broad catch is deliberate: this is the evaluation boundary, and one
        # failing question must not abort the remaining ones.
        try:
            answer = agent(q["question"])
            print(f"π© A: {answer}\n")
            # Normalise once, outside the keyword loop, and tolerate
            # non-string answers.
            answer_text = "" if answer is None else str(answer).lower()
            matched = any(
                keyword.lower() in answer_text
                for keyword in q["expected_keywords"]
            )
            expected = ", ".join(q["expected_keywords"])
            result = "β Correct" if matched else "β Incorrect"
            print(f"{result} β Expected one of: [{expected}]\n")
            if matched:
                correct += 1
        except Exception as e:
            print(f"π₯ Error: {e}\n")
    score_report = f"β Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report