sql_env / specs /F001-VERIFICATION_INPUT.json
hjerpe's picture
Upload folder using huggingface_hub
5dd1bb4 verified
Raw
History Blame Contribute Delete
14.6 kB
{
"$schema": "autocode-verification-input-v1",
"feature_id": "F001",
"spec_path": "specs/F001-IMPLEMENTATION_SPEC.md",
"generated": "2026-03-24T12:00:00Z",
"verification_mode": "mvp",
"overview": {
"summary": "Complete the step/reset lifecycle so the SQL environment actually executes SQL queries against real Spider SQLite databases. Replace the non-functional Ollama-based action interpretation with structured actions (DESCRIBE, SAMPLE, QUERY, ANSWER) that the agent provides directly. Implement sandboxed SQL execution (read-only, SELECT-only, 5s timeout, 20-row truncation), question loading from Spider JSON, per-episode state management via EpisodeContext, and a 15-step budget.",
"goal": "Enable agents to play complete RL episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers against real databases."
},
"interfaces": {
"types": [
{
"name": "SQLAction",
"fields": [
{"name": "action_type", "type": "str", "description": "One of: DESCRIBE, SAMPLE, QUERY, ANSWER"},
{"name": "argument", "type": "str", "description": "Table name (DESCRIBE/SAMPLE), SQL string (QUERY), or answer value (ANSWER)"}
],
"description": "Structured action from agent to environment. Extends openenv Action base."
},
{
"name": "SQLObservation",
"fields": [
{"name": "done", "type": "bool", "description": "Whether the episode has ended"},
{"name": "reward", "type": "float | None", "description": "Reward signal (set on terminal step)"},
{"name": "question", "type": "str", "description": "The NL question to answer"},
{"name": "schema_info", "type": "str", "description": "Known schema info (table names initially, columns added after DESCRIBE)"},
{"name": "result", "type": "str", "description": "Result of last action (truncated to 20 rows)"},
{"name": "error", "type": "str", "description": "Error message if action failed, empty string otherwise"},
{"name": "step_count", "type": "int", "description": "Current step number (0-indexed)"},
{"name": "budget_remaining", "type": "int", "description": "Steps left before forced termination"},
{"name": "action_history", "type": "list[str]", "description": "Summary of previous actions taken"}
],
"description": "Rich observation from environment to agent. Extends openenv Observation base."
},
{
"name": "QuestionRecord",
"fields": [
{"name": "question_id", "type": "str", "description": "Unique identifier for the question"},
{"name": "question_text", "type": "str", "description": "Natural language question"},
{"name": "database_name", "type": "str", "description": "Which SQLite database to load (matches db_id)"},
{"name": "gold_sql", "type": "str", "description": "Reference SQL query (hidden from agent)"},
{"name": "gold_answer", "type": "str", "description": "Expected answer (hidden from agent)"},
{"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list"},
{"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"},
{"name": "tables_involved", "type": "list[str]", "description": "Tables referenced by gold query"}
],
"description": "Metadata for a single question from the Spider dataset. Server-side only."
},
{
"name": "EpisodeContext",
"fields": [
{"name": "episode_id", "type": "str", "description": "Unique episode identifier"},
{"name": "db_connection", "type": "sqlite3.Connection", "description": "Read-only connection to episode database"},
{"name": "question_record", "type": "QuestionRecord", "description": "The selected question for this episode"},
{"name": "step_count", "type": "int", "description": "Current step number"},
{"name": "budget", "type": "int", "description": "Steps remaining (default 15)"},
{"name": "described_tables", "type": "set[str]", "description": "Tables the agent has DESCRIBEd"},
{"name": "action_log", "type": "list[str]", "description": "Human-readable action summaries"},
{"name": "done", "type": "bool", "description": "Whether the episode has ended"},
{"name": "gold_answer", "type": "str | None", "description": "Computed at reset by running gold_sql"}
],
"description": "Per-episode server-side state. Never sent to agent."
}
],
"functions": [
{
"name": "SQLEnvironment.__init__",
"params": [
{"name": "questions_path", "type": "str", "description": "Path to Spider questions JSON file"},
{"name": "db_dir", "type": "str", "description": "Directory containing Spider SQLite database files"},
{"name": "tokenizer", "type": "ModelTokenizer", "description": "OpenEnv tokenizer for compatibility"},
{"name": "step_budget", "type": "int", "default": "15", "description": "Maximum steps per episode"}
],
"returns": "None",
"raises": ["FileNotFoundError", "ValueError"],
"description": "Initialize environment with question dataset and database directory. Loads questions at init time."
},
{
"name": "SQLEnvironment.reset",
"params": [
{"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for question selection"},
{"name": "episode_id", "type": "str | None", "default": "None", "description": "Optional episode identifier"}
],
"returns": "SQLObservation",
"raises": ["FileNotFoundError"],
"description": "Pick a random question, open a read-only SQLite connection, compute the gold answer, and return the initial observation with the question text and table names."
},
{
"name": "SQLEnvironment.step",
"params": [
{"name": "action", "type": "SQLAction", "description": "Structured action with action_type and argument"},
{"name": "timeout_s", "type": "float", "default": "30", "description": "Overall step timeout"}
],
"returns": "SQLObservation",
"raises": [],
"description": "Dispatch the action to its handler, update the episode context, enforce the budget, and return an observation. Never raises; errors are reported in the observation.error field."
},
{
"name": "SQLEnvironment._execute_sql",
"params": [
{"name": "sql", "type": "str", "description": "SQL query to execute"},
{"name": "timeout_s", "type": "float", "default": "5.0", "description": "Maximum execution time"}
],
"returns": "list[tuple]",
"raises": ["ValueError", "sqlite3.OperationalError"],
"description": "Sandboxed SQL execution with SELECT-only validation, read-only connection, timeout via progress_handler, and result truncation."
},
{
"name": "SQLEnvironment._handle_describe",
"params": [
{"name": "table_name", "type": "str", "description": "Name of table to describe"}
],
"returns": "str",
"description": "Return column names, types, and row count for a table. If the table is not found, return an error string that lists the available tables."
},
{
"name": "SQLEnvironment._handle_sample",
"params": [
{"name": "table_name", "type": "str", "description": "Name of table to sample"},
{"name": "limit", "type": "int", "default": "5", "description": "Number of rows to return"}
],
"returns": "str",
"description": "Execute SELECT * FROM table LIMIT N via _execute_sql, return formatted rows."
},
{
"name": "SQLEnvironment._handle_query",
"params": [
{"name": "sql", "type": "str", "description": "SQL SELECT query to execute"}
],
"returns": "str",
"description": "Validate SELECT-only, execute with 5s timeout, format results, truncate to 20 rows with indicator."
},
{
"name": "SQLEnvironment._handle_answer",
"params": [
{"name": "value", "type": "str", "description": "Agent's answer string"}
],
"returns": "tuple[bool, float]",
"description": "Compare to gold answer (case-insensitive, whitespace-stripped string comparison for MVP). Returns (is_correct, reward). Sets episode done=True."
},
{
"name": "SQLEnvironment._build_observation",
"params": [],
"returns": "SQLObservation",
"description": "Construct rich SQLObservation from current EpisodeContext state."
},
{
"name": "SQLEnvironment._load_questions",
"params": [
{"name": "path", "type": "str", "description": "Path to questions JSON file"}
],
"returns": "list[QuestionRecord]",
"raises": ["FileNotFoundError", "ValueError"],
"description": "Load Spider question JSON and parse into QuestionRecord list."
},
{
"name": "SQLEnvironment._open_db",
"params": [
{"name": "db_name", "type": "str", "description": "Database name (matches db_id in questions)"}
],
"returns": "sqlite3.Connection",
"raises": ["FileNotFoundError"],
"description": "Open read-only SQLite connection using URI file:{path}?mode=ro."
}
],
"api_endpoints": [
{
"method": "POST",
"path": "/reset",
"request_body": {
"type": "object",
"fields": ["seed: int | null", "episode_id: str | null"]
},
"response_body": {
"type": "SQLObservation"
},
"errors": [
{"status": 500, "when": "Database file not found or questions file missing"}
]
},
{
"method": "POST",
"path": "/step",
"request_body": {
"type": "SQLAction",
"fields": ["action_type: str", "argument: str"]
},
"response_body": {
"type": "SQLObservation"
},
"errors": [
{"status": 422, "when": "Invalid action schema (missing action_type or argument)"}
]
}
]
},
"data_flow": {
"primary_flow": [
"Agent calls POST /reset to start a new episode",
"Environment picks a random QuestionRecord from loaded questions",
"Environment opens read-only SQLite connection for the question's database",
"Environment executes gold_sql to compute gold_answer (stored server-side)",
"Environment creates EpisodeContext with step_count=0, budget=15",
"Environment returns SQLObservation with question text and table names (columns hidden)",
"Agent calls POST /step with SQLAction (DESCRIBE/SAMPLE/QUERY/ANSWER)",
"Environment dispatches to appropriate handler based on action_type",
"Handler executes against SQLite (DESCRIBE/SAMPLE/QUERY) or compares answer (ANSWER)",
"Environment updates EpisodeContext: step_count++, budget-- (except ANSWER)",
"Environment checks budget exhaustion and sets done=True if budget==0",
"Environment returns SQLObservation with result/error, updated budget, action_history"
],
"alternative_flows": [
{
"name": "ANSWER submission",
"trigger": "Agent sends action_type=ANSWER",
"steps": [
"Compare argument to gold_answer (case-insensitive, stripped)",
"Set done=True, reward=1.0 (correct) or 0.0 (incorrect)",
"Do NOT decrement budget",
"Return terminal observation"
]
},
{
"name": "Budget exhaustion",
"trigger": "Budget reaches 0 after a DESCRIBE/SAMPLE/QUERY step",
"steps": [
"Set done=True, reward=0.0",
"Return terminal observation with done=True"
]
},
{
"name": "Invalid SQL",
"trigger": "Agent sends non-SELECT query or malformed SQL",
"steps": [
"Reject at SELECT-only validation or catch sqlite3 error",
"Set observation.error with descriptive message",
"Step still counts against budget",
"Return observation with error field populated"
]
},
{
"name": "Query timeout",
"trigger": "SQL execution exceeds 5 seconds",
"steps": [
"Interrupt query via sqlite3 progress_handler",
"Set observation.error to timeout message",
"Step counts against budget"
]
},
{
"name": "Table not found",
"trigger": "DESCRIBE/SAMPLE with nonexistent table name",
"steps": [
"Return error listing available table names",
"Step counts against budget"
]
}
]
},
"error_handling": {
"error_types": [
{
"name": "InvalidActionType",
"when": "action_type not in {DESCRIBE, SAMPLE, QUERY, ANSWER}",
"message_template": "Unknown action type '{action_type}'. Valid types: DESCRIBE, SAMPLE, QUERY, ANSWER"
},
{
"name": "TableNotFound",
"when": "DESCRIBE or SAMPLE with table name not in database",
"message_template": "Table '{table_name}' not found. Available tables: {table_list}"
},
{
"name": "NonSelectQuery",
"when": "QUERY action with SQL that is not a SELECT statement",
"message_template": "Only SELECT queries are allowed. Got: {first_keyword}"
},
{
"name": "SQLSyntaxError",
"when": "SELECT query with invalid syntax",
"message_template": "SQL error: {sqlite3_error_message}"
},
{
"name": "QueryTimeout",
"when": "SQL execution exceeds the 5-second timeout",
"message_template": "Query timed out after 5.0 seconds"
},
{
"name": "EmptyArgument",
"when": "argument field is empty or whitespace-only",
"message_template": "Argument cannot be empty for {action_type}"
},
{
"name": "DatabaseNotFound",
"when": "SQLite file not found during reset",
"message_template": "Database '{db_name}' not found in {db_dir}"
}
],
"retry_strategy": null
},
"dependencies": {
"external": [
"sqlite3 (stdlib)",
"pydantic",
"openenv (core.env_server)",
"torch"
],
"internal": [
"models.py",
"server/sql_environment.py",
"server/app.py",
"client.py",
"data/databases/models.py",
"data/questions/student_assessment.json"
]
}
}