Spaces:

mabelwang21
/

Agents_Final_Assignment

Sleeping

App Files Files Community

mabelwang21 committed on May 26, 2025

Commit

3cc0589

1 Parent(s): 2c6f69a

quick check accuracy

Browse files

Files changed (1) hide show

test_agent.py +49 -15

test_agent.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from agent import MyAgent
 def test_agent(
@@ -8,11 +8,17 @@ def test_agent(
     max_tests: int = 5,
 ):
     """
-    Load up to max_tests questions from the GAIA metadata JSONL file
-    and run them through MyAgent for a quick functionality check.
     """
-    # Initialize agent
-    agent = MyAgent()
     metadata_file = Path(metadata_path)
     if not metadata_file.exists():
@@ -29,10 +35,9 @@ def test_agent(
                 print(f"Invalid JSON on line {i+1}")
                 continue
-            # Support both 'task_id' and 'id'
             task_id = meta.get("task_id") or meta.get("id") or ""
-            # Support both 'question' and 'text'
             question = meta.get("Question") or meta.get("text") or ""
             print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
             print(f"Question: {question}")
@@ -42,19 +47,48 @@ def test_agent(
                 continue
             try:
-                # If there's a file_name field, pass it to agent.run
-                file_arg = None
-                if meta.get("file_name"):
-                    file_arg = meta.get("file_name")
-                # Call agent with question and optional file
                 if file_arg:
-                    answer = agent.run(question, file_paths=[file_arg])
                 else:
-                    answer = agent.run(question)
-                print(f"Answer: {answer}\n")
             except Exception as e:
                 print(f"Error running agent on question '{question}': {e}\n")
 if __name__ == "__main__":
     import argparse

 import json
 from pathlib import Path
+from agent3 import MyAgent
 def test_agent(
     max_tests: int = 5,
 ):
     """
+    Load up to max_tests questions from the GAIA metadata JSONL file,
+    run them through MyAgent, and compare with the correct answer.
     """
+    try:
+        agent = MyAgent()
+    except Exception as e:
+        print(f"Error initializing agent: {e}")
+        return
+    correct_count = 0
+    total_count = 0
     metadata_file = Path(metadata_path)
     if not metadata_file.exists():
                 print(f"Invalid JSON on line {i+1}")
                 continue
             task_id = meta.get("task_id") or meta.get("id") or ""
             question = meta.get("Question") or meta.get("text") or ""
+            correct_answer = meta.get("Final answer") or meta.get("final answer") or meta.get("Answer") or ""
             print(f"--- Test {i+1}/{max_tests}: Task ID {task_id} ---")
             print(f"Question: {question}")
                 continue
             try:
+                file_arg = meta.get("file_name")
                 if file_arg:
+                    try:
+                        answer = agent.run(question, file_paths=[file_arg])
+                    except Exception as e:
+                        import traceback
+                        print(f"Error running agent with file: {e}")
+                        print(traceback.format_exc())
+                        continue
+                else:
+                    try:
+                        answer = agent.run(question)
+                    except Exception as e:
+                        import traceback
+                        print(f"Error running agent: {e}")
+                        print(traceback.format_exc())
+                        continue
+                print(f"Agent Answer: {answer}")
+                print(f"Correct Answer: {correct_answer}")
+                # Normalize for comparison
+                def normalize(s):
+                    return str(s).strip().lower()
+                if normalize(answer) == normalize(correct_answer):
+                    print("✅ MATCH\n")
+                    correct_count += 1
                 else:
+                    print("❌ NO MATCH\n")
+                total_count += 1
             except Exception as e:
                 print(f"Error running agent on question '{question}': {e}\n")
+    print(f"=== Final Results ===")
+    print(f"Total Tests: {total_count}")
+    print(f"Correct Answers: {correct_count}")
+    if total_count > 0:
+        print(f"Accuracy: {correct_count / total_count * 100:.2f}%")
+    else:
+        print("No valid tests run.")
 if __name__ == "__main__":
     import argparse