| { |
| "$schema": "autocode-verification-input-v1", |
| "feature_id": "F001", |
| "spec_path": "specs/F001-IMPLEMENTATION_SPEC.md", |
| "generated": "2026-03-24T12:00:00Z", |
| "verification_mode": "mvp", |
|
|
| "overview": { |
| "summary": "Complete the step/reset lifecycle so the SQL environment actually executes SQL queries against real Spider SQLite databases. Replace the non-functional Ollama-based action interpretation with structured actions (DESCRIBE, SAMPLE, QUERY, ANSWER) that the agent provides directly. Implement sandboxed SQL execution (read-only, SELECT-only, 5s timeout, 20-row truncation), question loading from Spider JSON, per-episode state management via EpisodeContext, and a 15-step budget.", |
| "goal": "Enable agents to play complete RL episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers against real databases." |
| }, |
|
|
| "interfaces": { |
| "types": [ |
| { |
| "name": "SQLAction", |
| "fields": [ |
| {"name": "action_type", "type": "str", "description": "One of: DESCRIBE, SAMPLE, QUERY, ANSWER"}, |
| {"name": "argument", "type": "str", "description": "Table name (DESCRIBE/SAMPLE), SQL string (QUERY), or answer value (ANSWER)"} |
| ], |
| "description": "Structured action from agent to environment. Extends openenv Action base." |
| }, |
| { |
| "name": "SQLObservation", |
| "fields": [ |
| {"name": "done", "type": "bool", "description": "Whether the episode has ended"}, |
| {"name": "reward", "type": "float | None", "description": "Reward signal (set on terminal step)"}, |
| {"name": "question", "type": "str", "description": "The NL question to answer"}, |
| {"name": "schema_info", "type": "str", "description": "Known schema info (table names initially, columns added after DESCRIBE)"}, |
| {"name": "result", "type": "str", "description": "Result of last action (truncated to 20 rows)"}, |
| {"name": "error", "type": "str", "description": "Error message if action failed, empty string otherwise"}, |
| {"name": "step_count", "type": "int", "description": "Current step number (0-indexed)"}, |
| {"name": "budget_remaining", "type": "int", "description": "Steps left before forced termination"}, |
| {"name": "action_history", "type": "list[str]", "description": "Summary of previous actions taken"} |
| ], |
| "description": "Rich observation from environment to agent. Extends openenv Observation base." |
| }, |
| { |
| "name": "QuestionRecord", |
| "fields": [ |
| {"name": "question_id", "type": "str", "description": "Unique identifier for the question"}, |
| {"name": "question_text", "type": "str", "description": "Natural language question"}, |
| {"name": "database_name", "type": "str", "description": "Which SQLite database to load (matches db_id)"}, |
| {"name": "gold_sql", "type": "str", "description": "Reference SQL query (hidden from agent)"}, |
| {"name": "gold_answer", "type": "str", "description": "Expected answer (hidden from agent)"}, |
| {"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list"}, |
| {"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"}, |
| {"name": "tables_involved", "type": "list[str]", "description": "Tables referenced by gold query"} |
| ], |
| "description": "Metadata for a single question from the Spider dataset. Server-side only." |
| }, |
| { |
| "name": "EpisodeContext", |
| "fields": [ |
| {"name": "episode_id", "type": "str", "description": "Unique episode identifier"}, |
| {"name": "db_connection", "type": "sqlite3.Connection", "description": "Read-only connection to episode database"}, |
| {"name": "question_record", "type": "QuestionRecord", "description": "The selected question for this episode"}, |
| {"name": "step_count", "type": "int", "description": "Current step number"}, |
| {"name": "budget", "type": "int", "description": "Steps remaining (default 15)"}, |
| {"name": "described_tables", "type": "set[str]", "description": "Tables the agent has DESCRIBEd"}, |
| {"name": "action_log", "type": "list[str]", "description": "Human-readable action summaries"}, |
| {"name": "done", "type": "bool", "description": "Whether the episode has ended"}, |
| {"name": "gold_answer", "type": "str | None", "description": "Computed at reset by running gold_sql"} |
| ], |
| "description": "Per-episode server-side state. Never sent to agent." |
| } |
| ], |
| "functions": [ |
| { |
| "name": "SQLEnvironment.__init__", |
| "params": [ |
| {"name": "questions_path", "type": "str", "description": "Path to Spider questions JSON file"}, |
| {"name": "db_dir", "type": "str", "description": "Directory containing Spider SQLite database files"}, |
| {"name": "tokenizer", "type": "ModelTokenizer", "description": "OpenEnv tokenizer for compatibility"}, |
| {"name": "step_budget", "type": "int", "default": "15", "description": "Maximum steps per episode"} |
| ], |
| "returns": "None", |
| "raises": ["FileNotFoundError", "ValueError"], |
| "description": "Initialize environment with question dataset and database directory. Loads questions at init time." |
| }, |
| { |
| "name": "SQLEnvironment.reset", |
| "params": [ |
| {"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for question selection"}, |
| {"name": "episode_id", "type": "str | None", "default": "None", "description": "Optional episode identifier"} |
| ], |
| "returns": "SQLObservation", |
| "raises": ["FileNotFoundError"], |
| "description": "Pick random question, open read-only SQLite, compute gold answer, return initial observation with question text and table names." |
| }, |
| { |
| "name": "SQLEnvironment.step", |
| "params": [ |
| {"name": "action", "type": "SQLAction", "description": "Structured action with action_type and argument"}, |
| {"name": "timeout_s", "type": "float", "default": "30", "description": "Overall step timeout"} |
| ], |
| "returns": "SQLObservation", |
| "raises": [], |
| "description": "Dispatch action to handler, update episode context, enforce budget, return observation. Never raises -- errors are in observation.error field." |
| }, |
| { |
| "name": "SQLEnvironment._execute_sql", |
| "params": [ |
| {"name": "sql", "type": "str", "description": "SQL query to execute"}, |
| {"name": "timeout_s", "type": "float", "default": "5.0", "description": "Maximum execution time"} |
| ], |
| "returns": "list[tuple]", |
| "raises": ["ValueError", "sqlite3.OperationalError"], |
| "description": "Sandboxed SQL execution with SELECT-only validation, read-only connection, timeout via progress_handler, and result truncation." |
| }, |
| { |
| "name": "SQLEnvironment._handle_describe", |
| "params": [ |
| {"name": "table_name", "type": "str", "description": "Name of table to describe"} |
| ], |
| "returns": "str", |
| "description": "Return column names, types, and row count for a table. Returns error string if table not found, listing available tables." |
| }, |
| { |
| "name": "SQLEnvironment._handle_sample", |
| "params": [ |
| {"name": "table_name", "type": "str", "description": "Name of table to sample"}, |
| {"name": "limit", "type": "int", "default": "5", "description": "Number of rows to return"} |
| ], |
| "returns": "str", |
| "description": "Execute SELECT * FROM table LIMIT N via _execute_sql, return formatted rows." |
| }, |
| { |
| "name": "SQLEnvironment._handle_query", |
| "params": [ |
| {"name": "sql", "type": "str", "description": "SQL SELECT query to execute"} |
| ], |
| "returns": "str", |
| "description": "Validate SELECT-only, execute with 5s timeout, format results, truncate to 20 rows with indicator." |
| }, |
| { |
| "name": "SQLEnvironment._handle_answer", |
| "params": [ |
| {"name": "value", "type": "str", "description": "Agent's answer string"} |
| ], |
| "returns": "tuple[bool, float]", |
| "description": "Compare to gold answer (case-insensitive, stripped string comparison for MVP). Returns (is_correct, reward). Sets episode done=True." |
| }, |
| { |
| "name": "SQLEnvironment._build_observation", |
| "params": [], |
| "returns": "SQLObservation", |
| "description": "Construct rich SQLObservation from current EpisodeContext state." |
| }, |
| { |
| "name": "SQLEnvironment._load_questions", |
| "params": [ |
| {"name": "path", "type": "str", "description": "Path to questions JSON file"} |
| ], |
| "returns": "list[QuestionRecord]", |
| "raises": ["FileNotFoundError", "ValueError"], |
| "description": "Load Spider question JSON and parse into QuestionRecord list." |
| }, |
| { |
| "name": "SQLEnvironment._open_db", |
| "params": [ |
| {"name": "db_name", "type": "str", "description": "Database name (matches db_id in questions)"} |
| ], |
| "returns": "sqlite3.Connection", |
| "raises": ["FileNotFoundError"], |
| "description": "Open read-only SQLite connection using URI file:{path}?mode=ro." |
| } |
| ], |
| "api_endpoints": [ |
| { |
| "method": "POST", |
| "path": "/reset", |
| "request_body": { |
| "type": "object", |
| "fields": ["seed: int | null", "episode_id: str | null"] |
| }, |
| "response_body": { |
| "type": "SQLObservation" |
| }, |
| "errors": [ |
| {"status": 500, "when": "Database file not found or questions file missing"} |
| ] |
| }, |
| { |
| "method": "POST", |
| "path": "/step", |
| "request_body": { |
| "type": "SQLAction", |
| "fields": ["action_type: str", "argument: str"] |
| }, |
| "response_body": { |
| "type": "SQLObservation" |
| }, |
| "errors": [ |
| {"status": 422, "when": "Invalid action schema (missing action_type or argument)"} |
| ] |
| } |
| ] |
| }, |
|
|
| "data_flow": { |
| "primary_flow": [ |
| "Agent calls POST /reset to start a new episode", |
| "Environment picks a random QuestionRecord from loaded questions", |
| "Environment opens read-only SQLite connection for the question's database", |
| "Environment executes gold_sql to compute gold_answer (stored server-side)", |
| "Environment creates EpisodeContext with step_count=0, budget=15", |
| "Environment returns SQLObservation with question text and table names (columns hidden)", |
| "Agent calls POST /step with SQLAction (DESCRIBE/SAMPLE/QUERY/ANSWER)", |
| "Environment dispatches to appropriate handler based on action_type", |
| "Handler executes against SQLite (DESCRIBE/SAMPLE/QUERY) or compares answer (ANSWER)", |
| "Environment updates EpisodeContext: step_count++, budget-- (ANSWER does not decrement budget)", |
| "Environment checks budget exhaustion and sets done=True if budget==0", |
| "Environment returns SQLObservation with result/error, updated budget, action_history" |
| ], |
| "alternative_flows": [ |
| { |
| "name": "ANSWER submission", |
| "trigger": "Agent sends action_type=ANSWER", |
| "steps": [ |
| "Compare argument to gold_answer (case-insensitive, stripped)", |
| "Set done=True, reward=1.0 (correct) or 0.0 (incorrect)", |
| "Do NOT decrement budget", |
| "Return terminal observation" |
| ] |
| }, |
| { |
| "name": "Budget exhaustion", |
| "trigger": "Budget reaches 0 after a DESCRIBE/SAMPLE/QUERY step", |
| "steps": [ |
| "Set done=True, reward=0.0", |
| "Return terminal observation with done=True" |
| ] |
| }, |
| { |
| "name": "Invalid SQL", |
| "trigger": "Agent sends non-SELECT query or malformed SQL", |
| "steps": [ |
| "Reject at SELECT-only validation or catch sqlite3 error", |
| "Set observation.error with descriptive message", |
| "Step still counts against budget", |
| "Return observation with error field populated" |
| ] |
| }, |
| { |
| "name": "Query timeout", |
| "trigger": "SQL execution exceeds 5 seconds", |
| "steps": [ |
| "Interrupt query via sqlite3 progress_handler", |
| "Set observation.error to timeout message", |
| "Step counts against budget" |
| ] |
| }, |
| { |
| "name": "Table not found", |
| "trigger": "DESCRIBE/SAMPLE with nonexistent table name", |
| "steps": [ |
| "Return error listing available table names", |
| "Step counts against budget" |
| ] |
| } |
| ] |
| }, |
|
|
| "error_handling": { |
| "error_types": [ |
| { |
| "name": "InvalidActionType", |
| "when": "action_type not in {DESCRIBE, SAMPLE, QUERY, ANSWER}", |
| "message_template": "Unknown action type '{action_type}'. Valid types: DESCRIBE, SAMPLE, QUERY, ANSWER" |
| }, |
| { |
| "name": "TableNotFound", |
| "when": "DESCRIBE or SAMPLE with table name not in database", |
| "message_template": "Table '{table_name}' not found. Available tables: {table_list}" |
| }, |
| { |
| "name": "NonSelectQuery", |
| "when": "QUERY action with SQL that is not a SELECT statement", |
| "message_template": "Only SELECT queries are allowed. Got: {first_keyword}" |
| }, |
| { |
| "name": "SQLSyntaxError", |
| "when": "SELECT query with invalid syntax", |
| "message_template": "SQL error: {sqlite3_error_message}" |
| }, |
| { |
| "name": "QueryTimeout", |
| "when": "SQL execution exceeds 5 second timeout", |
| "message_template": "Query timed out after 5.0 seconds" |
| }, |
| { |
| "name": "EmptyArgument", |
| "when": "argument field is empty or whitespace-only", |
| "message_template": "Argument cannot be empty for {action_type}" |
| }, |
| { |
| "name": "DatabaseNotFound", |
| "when": "SQLite file not found during reset", |
| "message_template": "Database '{db_name}' not found in {db_dir}" |
| } |
| ], |
| "retry_strategy": null |
| }, |
|
|
| "dependencies": { |
| "external": [ |
| "sqlite3 (stdlib)", |
| "pydantic", |
| "openenv (core.env_server)", |
| "torch" |
| ], |
| "internal": [ |
| "models.py", |
| "server/sql_environment.py", |
| "server/app.py", |
| "client.py", |
| "data/databases/models.py", |
| "data/questions/student_assessment.json" |
| ] |
| } |
| } |
|
|