Denis Davydov
committed on
Commit
·
2e79a34
1
Parent(s):
a5c9e62
fix submitted_answer
Browse files
- app.py +2 -2
- test_agent_format.py +0 -99
app.py
CHANGED
|
@@ -74,7 +74,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 74 |
submitted_answer, reasoning_trace = agent(question_text, task_id)
|
| 75 |
answers_payload.append({
|
| 76 |
"task_id": task_id,
|
| 77 |
-
"
|
| 78 |
"reasoning_trace": reasoning_trace
|
| 79 |
})
|
| 80 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
@@ -83,7 +83,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 83 |
error_answer = f"AGENT ERROR: {e}"
|
| 84 |
answers_payload.append({
|
| 85 |
"task_id": task_id,
|
| 86 |
-
"
|
| 87 |
"reasoning_trace": f"Error occurred: {str(e)}"
|
| 88 |
})
|
| 89 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
|
|
|
|
| 74 |
submitted_answer, reasoning_trace = agent(question_text, task_id)
|
| 75 |
answers_payload.append({
|
| 76 |
"task_id": task_id,
|
| 77 |
+
"submitted_answer": submitted_answer,
|
| 78 |
"reasoning_trace": reasoning_trace
|
| 79 |
})
|
| 80 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
|
| 83 |
error_answer = f"AGENT ERROR: {e}"
|
| 84 |
answers_payload.append({
|
| 85 |
"task_id": task_id,
|
| 86 |
+
"submitted_answer": error_answer,
|
| 87 |
"reasoning_trace": f"Error occurred: {str(e)}"
|
| 88 |
})
|
| 89 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
|
test_agent_format.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Test script to verify the agent's answer formatting works correctly.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
from agent import smart_agent
|
| 8 |
-
from utils import format_gaia_answer
|
| 9 |
-
|
| 10 |
-
def test_answer_formatting():
|
| 11 |
-
"""Test the answer formatting function with various inputs."""
|
| 12 |
-
|
| 13 |
-
test_cases = [
|
| 14 |
-
# Test case: (raw_answer, expected_format)
|
| 15 |
-
("I think the answer is 42. FINAL ANSWER: 42", "42"),
|
| 16 |
-
("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
|
| 17 |
-
("After research, FINAL ANSWER: New York", "New York"),
|
| 18 |
-
("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
|
| 19 |
-
("FINAL ANSWER: 1,234", "1234"), # Should remove commas from numbers
|
| 20 |
-
("FINAL ANSWER: \"Hello World\"", "Hello World"), # Should remove quotes
|
| 21 |
-
("FINAL ANSWER: approximately 100", "100"), # Should remove qualifiers
|
| 22 |
-
("No clear final answer format here", "No clear final answer format here"), # Fallback
|
| 23 |
-
]
|
| 24 |
-
|
| 25 |
-
print("🧪 Testing answer formatting...")
|
| 26 |
-
for i, (raw, expected) in enumerate(test_cases, 1):
|
| 27 |
-
result = format_gaia_answer(raw)
|
| 28 |
-
status = "✅" if result == expected else "❌"
|
| 29 |
-
print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
|
| 30 |
-
if result != expected:
|
| 31 |
-
print(f" ⚠️ Mismatch detected!")
|
| 32 |
-
|
| 33 |
-
print("\n" + "="*50)
|
| 34 |
-
|
| 35 |
-
def test_simple_question():
|
| 36 |
-
"""Test the agent with a simple question."""
|
| 37 |
-
print("🤖 Testing agent with a simple question...")
|
| 38 |
-
|
| 39 |
-
question = "What is 2 + 2?"
|
| 40 |
-
try:
|
| 41 |
-
answer, reasoning = smart_agent(question)
|
| 42 |
-
print(f"Question: {question}")
|
| 43 |
-
print(f"Answer: {answer}")
|
| 44 |
-
print(f"Reasoning length: {len(reasoning)} characters")
|
| 45 |
-
print(f"Raw reasoning preview: {reasoning[:200]}...")
|
| 46 |
-
|
| 47 |
-
# Check if answer follows expected format
|
| 48 |
-
if answer and answer.strip():
|
| 49 |
-
print("✅ Agent returned a non-empty answer")
|
| 50 |
-
else:
|
| 51 |
-
print("❌ Agent returned empty answer")
|
| 52 |
-
|
| 53 |
-
except Exception as e:
|
| 54 |
-
print(f"❌ Error testing agent: {e}")
|
| 55 |
-
|
| 56 |
-
print("\n" + "="*50)
|
| 57 |
-
|
| 58 |
-
def test_api_format():
|
| 59 |
-
"""Test that our submission format matches API expectations."""
|
| 60 |
-
print("📡 Testing API submission format...")
|
| 61 |
-
|
| 62 |
-
# Simulate what would be sent to the API
|
| 63 |
-
sample_submission = {
|
| 64 |
-
"task_id": "test_task_1",
|
| 65 |
-
"model_answer": "42",
|
| 66 |
-
"reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
required_fields = ["task_id", "model_answer"]
|
| 70 |
-
optional_fields = ["reasoning_trace"]
|
| 71 |
-
|
| 72 |
-
print("Required fields check:")
|
| 73 |
-
for field in required_fields:
|
| 74 |
-
if field in sample_submission:
|
| 75 |
-
print(f"✅ {field}: {sample_submission[field]}")
|
| 76 |
-
else:
|
| 77 |
-
print(f"❌ Missing required field: {field}")
|
| 78 |
-
|
| 79 |
-
print("Optional fields check:")
|
| 80 |
-
for field in optional_fields:
|
| 81 |
-
if field in sample_submission:
|
| 82 |
-
print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
|
| 83 |
-
else:
|
| 84 |
-
print(f"ℹ️ Optional field not present: {field}")
|
| 85 |
-
|
| 86 |
-
if __name__ == "__main__":
|
| 87 |
-
print("🔧 GAIA Agent Format Testing")
|
| 88 |
-
print("="*50)
|
| 89 |
-
|
| 90 |
-
# Test 1: Answer formatting
|
| 91 |
-
test_answer_formatting()
|
| 92 |
-
|
| 93 |
-
# Test 2: Simple agent question
|
| 94 |
-
test_simple_question()
|
| 95 |
-
|
| 96 |
-
# Test 3: API format
|
| 97 |
-
test_api_format()
|
| 98 |
-
|
| 99 |
-
print("🎉 Testing complete!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|