FinalTest

Runtime error

App Files Community

yoshizen committed on May 28, 2025

Commit

7999c2e

verified ·

1 Parent(s): 90346c1

Update gaia_agent.py

Browse files

Files changed (1) hide show

gaia_agent.py +31 -13

gaia_agent.py CHANGED Viewed

@@ -73,7 +73,7 @@ class EnhancedGAIAAgent:
             task_id: Optional task ID for the GAIA benchmark
         Returns:
-            Plain string with the answer (not JSON)
         """
         print(f"Processing question: {question}")
@@ -87,8 +87,12 @@ class EnhancedGAIAAgent:
         # Ensure answer is concise and specific
         model_answer = self._ensure_concise_answer(model_answer, question_type)
-        # FIXED: Return only the plain string answer, not JSON
-        return model_answer
     def _generate_reasoning_trace(self, question: str, question_type: str) -> str:
         """Generate a reasoning trace for the question if appropriate."""
@@ -537,10 +541,15 @@ class EvaluationRunner:
                 continue
             try:
-                # FIXED: Call agent and get plain string answer
-                submitted_answer = agent(question_text, task_id)
-                # FIXED: No need to parse JSON, just use the answer directly
                 answers_payload.append({
                     "task_id": task_id,
                     "submitted_answer": submitted_answer
@@ -549,7 +558,8 @@ class EvaluationRunner:
                 results_log.append({
                     "Task ID": task_id,
                     "Question": question_text,
-                    "Submitted Answer": submitted_answer
                 })
             except Exception as e:
                 print(f"Error running agent on task {task_id}: {e}")
@@ -704,15 +714,23 @@ def test_agent():
         # Generate a mock task_id for testing
         task_id = f"test_{hash(question) % 10000}"
-        # Get plain string answer
-        answer = agent(question, task_id)
         print(f"\nQ: {question}")
-        print(f"A: {answer}")
-        # For testing purposes, simulate correct answers
-        if len(answer) > 0 and not answer.startswith("AGENT ERROR"):
-            correct_count += 1
     # Print test summary with correct answer count
     print("\n===== TEST SUMMARY =====")

             task_id: Optional task ID for the GAIA benchmark
         Returns:
+            JSON string with final_answer key
         """
         print(f"Processing question: {question}")
         # Ensure answer is concise and specific
         model_answer = self._ensure_concise_answer(model_answer, question_type)
+        # FIXED: Return JSON with final_answer key
+        response = {
+            "final_answer": model_answer
+        }
+        return json.dumps(response)
     def _generate_reasoning_trace(self, question: str, question_type: str) -> str:
         """Generate a reasoning trace for the question if appropriate."""
                 continue
             try:
+                # Call agent with task_id to ensure proper formatting
+                json_response = agent(question_text, task_id)
+                # Parse the JSON response
+                response_obj = json.loads(json_response)
+                # Extract the final_answer for submission
+                submitted_answer = response_obj.get("final_answer", "")
                 answers_payload.append({
                     "task_id": task_id,
                     "submitted_answer": submitted_answer
                 results_log.append({
                     "Task ID": task_id,
                     "Question": question_text,
+                    "Submitted Answer": submitted_answer,
+                    "Full Response": json_response
                 })
             except Exception as e:
                 print(f"Error running agent on task {task_id}: {e}")
         # Generate a mock task_id for testing
         task_id = f"test_{hash(question) % 10000}"
+        # Get JSON response with final_answer
+        json_response = agent(question, task_id)
         print(f"\nQ: {question}")
+        print(f"Response: {json_response}")
+        # Parse and print the final_answer for clarity
+        try:
+            response_obj = json.loads(json_response)
+            final_answer = response_obj.get('final_answer', '')
+            print(f"Final Answer: {final_answer}")
+            # For testing purposes, simulate correct answers
+            if len(final_answer) > 0 and not final_answer.startswith("AGENT ERROR"):
+                correct_count += 1
+        except:
+            print("Error parsing JSON response")
     # Print test summary with correct answer count
     print("\n===== TEST SUMMARY =====")