File size: 5,638 Bytes
5dd1bb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
{
  "$schema": "autocode-verification-input-v1",
  "feature_id": "F002",
  "spec_path": "specs/F002-IMPLEMENTATION_SPEC.md",
  "generated": "2026-03-27T12:00:00Z",
  "verification_mode": "mvp",

  "overview": {
    "summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% relative tolerance), string (case-insensitive, whitespace-normalized), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing or unrecognized.",
    "goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences."
  },

  "interfaces": {
    "types": [
      {
        "name": "EpisodeContext",
        "fields": [
          {"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"}
        ],
        "description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer."
      }
    ],
    "functions": [
      {
        "name": "verify_answer",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent's submitted answer string"},
          {"name": "gold", "type": "str", "description": "Gold answer as formatted string"},
          {"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"},
          {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"}
        ],
        "returns": "bool",
        "raises": [],
        "description": "Compares the agent's answer against the gold answer using a type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types. Returns False immediately when predicted is empty after stripping."
      },
      {
        "name": "_compare_integer",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"}
        ],
        "returns": "bool",
        "description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError."
      },
      {
        "name": "_compare_float",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"},
          {"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"}
        ],
        "returns": "bool",
        "description": "Float comparison with relative tolerance: matches when abs(pred - gold) <= tolerance * abs(gold). When gold == 0, an absolute tolerance of 1e-9 is used instead. Returns False on ValueError."
      },
      {
        "name": "_compare_string",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"}
        ],
        "returns": "bool",
        "description": "Case-insensitive, whitespace-normalized string comparison."
      },
      {
        "name": "_compare_list",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value as formatted string"},
          {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"}
        ],
        "returns": "bool",
        "description": "Order-insensitive set comparison. Parses both sides into sets of normalized strings and compares the two sets for equality."
      }
    ],
    "api_endpoints": []
  },

  "data_flow": {
    "primary_flow": [
      "Agent sends ANSWER action with value string",
      "step() dispatches to _handle_answer(value)",
      "_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)",
      "verify_answer() dispatches to type-specific comparer based on answer_type",
      "Comparer returns bool; _handle_answer returns (bool, float reward)"
    ],
    "alternative_flows": [
      {
        "name": "Unknown or missing answer_type",
        "trigger": "answer_type is None or not in known set",
        "steps": [
          "verify_answer receives answer_type=None or a value outside the known set",
          "Falls back to _compare_string(predicted, gold)",
          "Returns bool"
        ]
      },
      {
        "name": "Type coercion failure",
        "trigger": "predicted cannot be parsed as int or float",
        "steps": [
          "_compare_integer or _compare_float catches ValueError",
          "Returns False (answer treated as incorrect)"
        ]
      },
      {
        "name": "Empty or None input",
        "trigger": "predicted is None or is an empty string after strip",
        "steps": [
          "verify_answer returns False immediately"
        ]
      }
    ]
  },

  "error_handling": {
    "error_types": [
      {
        "name": "ValueError",
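        "when": "Predicted value cannot be coerced to int/float during comparison; caught inside _compare_integer/_compare_float, which return False instead of propagating the error to the caller"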
      },
      {
        "name": "RuntimeError",
        "when": "_handle_answer called with no active episode (existing behavior, unchanged)"
      }
    ],
    "retry_strategy": null
  },

  "dependencies": {
    "external": [],
    "internal": [
      {"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"},
      {"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"},
      {"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"}
    ]
  }
}