Final_Assignment_Template

Sleeping

App Files Community

Denis Davydov committed on Jul 9, 2025

Commit

2e79a34

1 Parent(s): a5c9e62

fix submitted_answer

Browse files

Files changed (2) hide show

app.py +2 -2
test_agent_format.py +0 -99

app.py CHANGED Viewed

@@ -74,7 +74,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             submitted_answer, reasoning_trace = agent(question_text, task_id)
             answers_payload.append({
                 "task_id": task_id,
-                "model_answer": submitted_answer,
                 "reasoning_trace": reasoning_trace
             })
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
@@ -83,7 +83,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
              error_answer = f"AGENT ERROR: {e}"
              answers_payload.append({
                 "task_id": task_id,
-                "model_answer": error_answer,
                 "reasoning_trace": f"Error occurred: {str(e)}"
             })
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})

             submitted_answer, reasoning_trace = agent(question_text, task_id)
             answers_payload.append({
                 "task_id": task_id,
+                "submitted_answer": submitted_answer,
                 "reasoning_trace": reasoning_trace
             })
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
              error_answer = f"AGENT ERROR: {e}"
              answers_payload.append({
                 "task_id": task_id,
+                "submitted_answer": error_answer,
                 "reasoning_trace": f"Error occurred: {str(e)}"
             })
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})

test_agent_format.py DELETED Viewed

@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script to verify the agent's answer formatting works correctly.
-"""
-import os
-from agent import smart_agent
-from utils import format_gaia_answer
-def test_answer_formatting():
-    """Test the answer formatting function with various inputs."""
-    test_cases = [
-        # Test case: (raw_answer, expected_format)
-        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
-        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
-        ("After research, FINAL ANSWER: New York", "New York"),
-        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
-        ("FINAL ANSWER: 1,234", "1234"),  # Should remove commas from numbers
-        ("FINAL ANSWER: \"Hello World\"", "Hello World"),  # Should remove quotes
-        ("FINAL ANSWER: approximately 100", "100"),  # Should remove qualifiers
-        ("No clear final answer format here", "No clear final answer format here"),  # Fallback
-    ]
-    print("🧪 Testing answer formatting...")
-    for i, (raw, expected) in enumerate(test_cases, 1):
-        result = format_gaia_answer(raw)
-        status = "✅" if result == expected else "❌"
-        print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
-        if result != expected:
-            print(f"   ⚠️  Mismatch detected!")
-    print("\n" + "="*50)
-def test_simple_question():
-    """Test the agent with a simple question."""
-    print("🤖 Testing agent with a simple question...")
-    question = "What is 2 + 2?"
-    try:
-        answer, reasoning = smart_agent(question)
-        print(f"Question: {question}")
-        print(f"Answer: {answer}")
-        print(f"Reasoning length: {len(reasoning)} characters")
-        print(f"Raw reasoning preview: {reasoning[:200]}...")
-        # Check if answer follows expected format
-        if answer and answer.strip():
-            print("✅ Agent returned a non-empty answer")
-        else:
-            print("❌ Agent returned empty answer")
-    except Exception as e:
-        print(f"❌ Error testing agent: {e}")
-    print("\n" + "="*50)
-def test_api_format():
-    """Test that our submission format matches API expectations."""
-    print("📡 Testing API submission format...")
-    # Simulate what would be sent to the API
-    sample_submission = {
-        "task_id": "test_task_1",
-        "model_answer": "42",
-        "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
-    }
-    required_fields = ["task_id", "model_answer"]
-    optional_fields = ["reasoning_trace"]
-    print("Required fields check:")
-    for field in required_fields:
-        if field in sample_submission:
-            print(f"✅ {field}: {sample_submission[field]}")
-        else:
-            print(f"❌ Missing required field: {field}")
-    print("Optional fields check:")
-    for field in optional_fields:
-        if field in sample_submission:
-            print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
-        else:
-            print(f"ℹ️  Optional field not present: {field}")
-if __name__ == "__main__":
-    print("🔧 GAIA Agent Format Testing")
-    print("="*50)
-    # Test 1: Answer formatting
-    test_answer_formatting()
-    # Test 2: Simple agent question
-    test_simple_question()
-    # Test 3: API format
-    test_api_format()
-    print("🏁 Testing complete!")