Spaces:

hjerpe
/

sql_env

Sleeping

File size: 5,638 Bytes

5dd1bb4

{
  "$schema": "autocode-verification-input-v1",
  "feature_id": "F002",
  "spec_path": "specs/F002-IMPLEMENTATION_SPEC.md",
  "generated": "2026-03-27T12:00:00Z",
  "verification_mode": "mvp",

  "overview": {
    "summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% relative tolerance), string (case-insensitive), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing or unrecognized.",
    "goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences."
  },

  "interfaces": {
    "types": [
      {
        "name": "EpisodeContext",
        "fields": [
          {"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"}
        ],
        "description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer."
      }
    ],
    "functions": [
      {
        "name": "verify_answer",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent's submitted answer string"},
          {"name": "gold", "type": "str", "description": "Gold answer as formatted string"},
          {"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"},
          {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"}
        ],
        "returns": "bool",
        "raises": [],
        "description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types."
      },
      {
        "name": "_compare_integer",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"}
        ],
        "returns": "bool",
        "description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError."
      },
      {
        "name": "_compare_float",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"},
          {"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"}
        ],
        "returns": "bool",
        "description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9. Returns False on ValueError."
      },
      {
        "name": "_compare_string",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value"}
        ],
        "returns": "bool",
        "description": "Case-insensitive, whitespace-normalized string comparison."
      },
      {
        "name": "_compare_list",
        "params": [
          {"name": "predicted", "type": "str", "description": "Agent value"},
          {"name": "gold", "type": "str", "description": "Gold value as formatted string"},
          {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"}
        ],
        "returns": "bool",
        "description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares them for equality."
      }
    ],
    "api_endpoints": []
  },

  "data_flow": {
    "primary_flow": [
      "Agent sends ANSWER action with value string",
      "step() dispatches to _handle_answer(value)",
      "_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)",
      "verify_answer() dispatches to type-specific comparer based on answer_type",
      "Comparer returns bool; _handle_answer returns (bool, float reward)"
    ],
    "alternative_flows": [
      {
        "name": "Unknown or missing answer_type",
        "trigger": "answer_type is None or not in known set",
        "steps": [
          "verify_answer receives answer_type=None or an unrecognized value",
          "Falls back to _compare_string(predicted, gold)",
          "Returns bool"
        ]
      },
      {
        "name": "Type coercion failure",
        "trigger": "predicted cannot be parsed as int or float",
        "steps": [
          "_compare_integer or _compare_float catches ValueError",
          "Returns False (answer treated as incorrect)"
        ]
      },
      {
        "name": "Empty or None input",
        "trigger": "predicted is None, or is an empty string after stripping whitespace",
        "steps": [
          "verify_answer returns False immediately"
        ]
      }
    ]
  },

  "error_handling": {
    "error_types": [
      {
        "name": "ValueError",
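        "when": "Predicted value cannot be coerced to int/float during comparison; caught inside _compare_integer/_compare_float, which return False, so it does not propagate out of verify_answer"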
      },
      {
        "name": "RuntimeError",
        "when": "_handle_answer called with no active episode (existing behavior, unchanged)"
      }
    ],
    "retry_strategy": null
  },

  "dependencies": {
    "external": [],
    "internal": [
      {"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"},
      {"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"},
      {"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"}
    ]
  }
}