{
"$schema": "autocode-verification-input-v1",
"feature_id": "F002",
"spec_path": "specs/F002-IMPLEMENTATION_SPEC.md",
"generated": "2026-03-27T12:00:00Z",
"verification_mode": "mvp",
"overview": {
"summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% relative tolerance), string (case-insensitive, whitespace-normalized), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing or unrecognized.",
"goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences."
},
"interfaces": {
"types": [
{
"name": "EpisodeContext",
"fields": [
{"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"}
],
"description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer."
}
],
"functions": [
{
"name": "verify_answer",
"params": [
{"name": "predicted", "type": "str", "description": "Agent's submitted answer string"},
{"name": "gold", "type": "str", "description": "Gold answer as formatted string"},
{"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"},
{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"}
],
"returns": "bool",
"raises": [],
"description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types."
},
{
"name": "_compare_integer",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"}
],
"returns": "bool",
"description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError."
},
{
"name": "_compare_float",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"},
{"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"}
],
"returns": "bool",
"description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9."
},
{
"name": "_compare_string",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"}
],
"returns": "bool",
"description": "Case-insensitive, whitespace-normalized string comparison."
},
{
"name": "_compare_list",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value as formatted string"},
{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"}
],
"returns": "bool",
"description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares equality."
}
],
"api_endpoints": []
},
"data_flow": {
"primary_flow": [
"Agent sends ANSWER action with value string",
"step() dispatches to _handle_answer(value)",
"_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)",
"verify_answer() dispatches to type-specific comparer based on answer_type",
"Comparer returns bool; _handle_answer returns (bool, float reward)"
],
"alternative_flows": [
{
"name": "Unknown or missing answer_type",
"trigger": "answer_type is None or not in known set",
"steps": [
"verify_answer receives answer_type=None or a value outside the known set",
"Falls back to _compare_string(predicted, gold)",
"Returns bool"
]
},
{
"name": "Type coercion failure",
"trigger": "predicted cannot be parsed as int or float",
"steps": [
"_compare_integer or _compare_float catches ValueError",
"Returns False (answer treated as incorrect)"
]
},
{
"name": "Empty or None input",
"trigger": "predicted is None or is an empty string after strip",
"steps": [
"verify_answer returns False immediately"
]
}
]
},
"error_handling": {
"error_types": [
{
"name": "ValueError",
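"when": "Predicted value cannot be coerced to int/float during comparison; caught inside _compare_integer/_compare_float, which return False instead of propagating to the caller"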
},
{
"name": "RuntimeError",
"when": "_handle_answer called with no active episode (existing behavior, unchanged)"
}
],
"retry_strategy": null
},
"dependencies": {
"external": [],
"internal": [
{"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"},
{"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"},
{"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"}
]
}
}