Spaces:

hjerpe
/

sql_env

Sleeping

App Files Files Community

sql_env / specs /F001-VERIFICATION_INPUT.json

hjerpe

Upload folder using huggingface_hub

5dd1bb4 verified 3 months ago

Raw

History Blame Contribute Delete

14.6 kB

	{
	"$schema": "autocode-verification-input-v1",
	"feature_id": "F001",
	"spec_path": "specs/F001-IMPLEMENTATION_SPEC.md",
	"generated": "2026-03-24T12:00:00Z",
	"verification_mode": "mvp",

	"overview": {
	"summary": "Complete the step/reset lifecycle so the SQL environment actually executes SQL queries against real Spider SQLite databases. Replace the non-functional Ollama-based action interpretation with structured actions (DESCRIBE, SAMPLE, QUERY, ANSWER) that the agent provides directly. Implement sandboxed SQL execution (read-only, SELECT-only, 5s timeout, 20-row truncation), question loading from Spider JSON, per-episode state management via EpisodeContext, and a 15-step budget.",
	"goal": "Enable agents to play complete RL episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers against real databases."
	},

	"interfaces": {
	"types": [
	{
	"name": "SQLAction",
	"fields": [
	{"name": "action_type", "type": "str", "description": "One of: DESCRIBE, SAMPLE, QUERY, ANSWER"},
	{"name": "argument", "type": "str", "description": "Table name (DESCRIBE/SAMPLE), SQL string (QUERY), or answer value (ANSWER)"}
	],
	"description": "Structured action from agent to environment. Extends openenv Action base."
	},
	{
	"name": "SQLObservation",
	"fields": [
	{"name": "done", "type": "bool", "description": "Whether the episode has ended"},
	{"name": "reward", "type": "float \| None", "description": "Reward signal (set on terminal step)"},
	{"name": "question", "type": "str", "description": "The NL question to answer"},
	{"name": "schema_info", "type": "str", "description": "Known schema info (table names initially, columns added after DESCRIBE)"},
	{"name": "result", "type": "str", "description": "Result of last action (truncated to 20 rows)"},
	{"name": "error", "type": "str", "description": "Error message if action failed, empty string otherwise"},
	{"name": "step_count", "type": "int", "description": "Current step number (0-indexed)"},
	{"name": "budget_remaining", "type": "int", "description": "Steps left before forced termination"},
	{"name": "action_history", "type": "list[str]", "description": "Summary of previous actions taken"}
	],
	"description": "Rich observation from environment to agent. Extends openenv Observation base."
	},
	{
	"name": "QuestionRecord",
	"fields": [
	{"name": "question_id", "type": "str", "description": "Unique identifier for the question"},
	{"name": "question_text", "type": "str", "description": "Natural language question"},
	{"name": "database_name", "type": "str", "description": "Which SQLite database to load (matches db_id)"},
	{"name": "gold_sql", "type": "str", "description": "Reference SQL query (hidden from agent)"},
	{"name": "gold_answer", "type": "str", "description": "Expected answer (hidden from agent)"},
	{"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list"},
	{"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"},
	{"name": "tables_involved", "type": "list[str]", "description": "Tables referenced by gold query"}
	],
	"description": "Metadata for a single question from the Spider dataset. Server-side only."
	},
	{
	"name": "EpisodeContext",
	"fields": [
	{"name": "episode_id", "type": "str", "description": "Unique episode identifier"},
	{"name": "db_connection", "type": "sqlite3.Connection", "description": "Read-only connection to episode database"},
	{"name": "question_record", "type": "QuestionRecord", "description": "The selected question for this episode"},
	{"name": "step_count", "type": "int", "description": "Current step number"},
	{"name": "budget", "type": "int", "description": "Steps remaining (default 15)"},
	{"name": "described_tables", "type": "set[str]", "description": "Tables the agent has DESCRIBEd"},
	{"name": "action_log", "type": "list[str]", "description": "Human-readable action summaries"},
	{"name": "done", "type": "bool", "description": "Whether the episode has ended"},
	{"name": "gold_answer", "type": "str \| None", "description": "Computed at reset by running gold_sql"}
	],
	"description": "Per-episode server-side state. Never sent to agent."
	}
	],
	"functions": [
	{
	"name": "SQLEnvironment.__init__",
	"params": [
	{"name": "questions_path", "type": "str", "description": "Path to Spider questions JSON file"},
	{"name": "db_dir", "type": "str", "description": "Directory containing Spider SQLite database files"},
	{"name": "tokenizer", "type": "ModelTokenizer", "description": "OpenEnv tokenizer for compatibility"},
	{"name": "step_budget", "type": "int", "default": "15", "description": "Maximum steps per episode"}
	],
	"returns": "None",
	"raises": ["FileNotFoundError", "ValueError"],
	"description": "Initialize environment with question dataset and database directory. Loads questions at init time."
	},
	{
	"name": "SQLEnvironment.reset",
	"params": [
	{"name": "seed", "type": "int \| None", "default": "None", "description": "Random seed for question selection"},
	{"name": "episode_id", "type": "str \| None", "default": "None", "description": "Optional episode identifier"}
	],
	"returns": "SQLObservation",
	"raises": ["FileNotFoundError"],
	"description": "Pick random question, open read-only SQLite, compute gold answer, return initial observation with question text and table names."
	},
	{
	"name": "SQLEnvironment.step",
	"params": [
	{"name": "action", "type": "SQLAction", "description": "Structured action with action_type and argument"},
	{"name": "timeout_s", "type": "float", "default": "30", "description": "Overall step timeout"}
	],
	"returns": "SQLObservation",
	"raises": [],
	"description": "Dispatch action to handler, update episode context, enforce budget, return observation. Never raises -- errors are in observation.error field."
	},
	{
	"name": "SQLEnvironment._execute_sql",
	"params": [
	{"name": "sql", "type": "str", "description": "SQL query to execute"},
	{"name": "timeout_s", "type": "float", "default": "5.0", "description": "Maximum execution time"}
	],
	"returns": "list[tuple]",
	"raises": ["ValueError", "sqlite3.OperationalError"],
	"description": "Sandboxed SQL execution with SELECT-only validation, read-only connection, timeout via progress_handler, and result truncation."
	},
	{
	"name": "SQLEnvironment._handle_describe",
	"params": [
	{"name": "table_name", "type": "str", "description": "Name of table to describe"}
	],
	"returns": "str",
	"description": "Return column names, types, and row count for a table. Returns error string if table not found, listing available tables."
	},
	{
	"name": "SQLEnvironment._handle_sample",
	"params": [
	{"name": "table_name", "type": "str", "description": "Name of table to sample"},
	{"name": "limit", "type": "int", "default": "5", "description": "Number of rows to return"}
	],
	"returns": "str",
	"description": "Execute SELECT * FROM table LIMIT N via _execute_sql, return formatted rows."
	},
	{
	"name": "SQLEnvironment._handle_query",
	"params": [
	{"name": "sql", "type": "str", "description": "SQL SELECT query to execute"}
	],
	"returns": "str",
	"description": "Validate SELECT-only, execute with 5s timeout, format results, truncate to 20 rows with indicator."
	},
	{
	"name": "SQLEnvironment._handle_answer",
	"params": [
	{"name": "value", "type": "str", "description": "Agent's answer string"}
	],
	"returns": "tuple[bool, float]",
	"description": "Compare to gold answer (case-insensitive string comparison for MVP). Returns (is_correct, reward). Sets episode done=True."
	},
	{
	"name": "SQLEnvironment._build_observation",
	"params": [],
	"returns": "SQLObservation",
	"description": "Construct rich SQLObservation from current EpisodeContext state."
	},
	{
	"name": "SQLEnvironment._load_questions",
	"params": [
	{"name": "path", "type": "str", "description": "Path to questions JSON file"}
	],
	"returns": "list[QuestionRecord]",
	"raises": ["FileNotFoundError", "ValueError"],
	"description": "Load Spider question JSON and parse into QuestionRecord list."
	},
	{
	"name": "SQLEnvironment._open_db",
	"params": [
	{"name": "db_name", "type": "str", "description": "Database name (matches db_id in questions)"}
	],
	"returns": "sqlite3.Connection",
	"raises": ["FileNotFoundError"],
	"description": "Open read-only SQLite connection using URI file:{path}?mode=ro."
	}
	],
	"api_endpoints": [
	{
	"method": "POST",
	"path": "/reset",
	"request_body": {
	"type": "object",
	"fields": ["seed: int \| null", "episode_id: str \| null"]
	},
	"response_body": {
	"type": "SQLObservation"
	},
	"errors": [
	{"status": 500, "when": "Database file not found or questions file missing"}
	]
	},
	{
	"method": "POST",
	"path": "/step",
	"request_body": {
	"type": "SQLAction",
	"fields": ["action_type: str", "argument: str"]
	},
	"response_body": {
	"type": "SQLObservation"
	},
	"errors": [
	{"status": 422, "when": "Invalid action schema (missing action_type or argument)"}
	]
	}
	]
	},

	"data_flow": {
	"primary_flow": [
	"Agent calls POST /reset to start a new episode",
	"Environment picks a random QuestionRecord from loaded questions",
	"Environment opens read-only SQLite connection for the question's database",
	"Environment executes gold_sql to compute gold_answer (stored server-side)",
	"Environment creates EpisodeContext with step_count=0, budget=15",
	"Environment returns SQLObservation with question text and table names (columns hidden)",
	"Agent calls POST /step with SQLAction (DESCRIBE/SAMPLE/QUERY/ANSWER)",
	"Environment dispatches to appropriate handler based on action_type",
	"Handler executes against SQLite (DESCRIBE/SAMPLE/QUERY) or compares answer (ANSWER)",
	"Environment updates EpisodeContext: step_count++, budget-- (except ANSWER)",
	"Environment checks budget exhaustion and sets done=True if budget==0",
	"Environment returns SQLObservation with result/error, updated budget, action_history"
	],
	"alternative_flows": [
	{
	"name": "ANSWER submission",
	"trigger": "Agent sends action_type=ANSWER",
	"steps": [
	"Compare argument to gold_answer (case-insensitive, stripped)",
	"Set done=True, reward=1.0 (correct) or 0.0 (incorrect)",
	"Do NOT decrement budget",
	"Return terminal observation"
	]
	},
	{
	"name": "Budget exhaustion",
	"trigger": "Budget reaches 0 after a DESCRIBE/SAMPLE/QUERY step",
	"steps": [
	"Set done=True, reward=0.0",
	"Return terminal observation with done=True"
	]
	},
	{
	"name": "Invalid SQL",
	"trigger": "Agent sends non-SELECT query or malformed SQL",
	"steps": [
	"Reject at SELECT-only validation or catch sqlite3 error",
	"Set observation.error with descriptive message",
	"Step still counts against budget",
	"Return observation with error field populated"
	]
	},
	{
	"name": "Query timeout",
	"trigger": "SQL execution exceeds 5 seconds",
	"steps": [
	"Interrupt query via sqlite3 progress_handler",
	"Set observation.error to timeout message",
	"Step counts against budget"
	]
	},
	{
	"name": "Table not found",
	"trigger": "DESCRIBE/SAMPLE with nonexistent table name",
	"steps": [
	"Return error listing available table names",
	"Step counts against budget"
	]
	}
	]
	},

	"error_handling": {
	"error_types": [
	{
	"name": "InvalidActionType",
	"when": "action_type not in {DESCRIBE, SAMPLE, QUERY, ANSWER}",
	"message_template": "Unknown action type '{action_type}'. Valid types: DESCRIBE, SAMPLE, QUERY, ANSWER"
	},
	{
	"name": "TableNotFound",
	"when": "DESCRIBE or SAMPLE with table name not in database",
	"message_template": "Table '{table_name}' not found. Available tables: {table_list}"
	},
	{
	"name": "NonSelectQuery",
	"when": "QUERY action with SQL that is not a SELECT statement",
	"message_template": "Only SELECT queries are allowed. Got: {first_keyword}"
	},
	{
	"name": "SQLSyntaxError",
	"when": "SELECT query with invalid syntax",
	"message_template": "SQL error: {sqlite3_error_message}"
	},
	{
	"name": "QueryTimeout",
	"when": "SQL execution exceeds 5 second timeout",
	"message_template": "Query timed out after 5.0 seconds"
	},
	{
	"name": "EmptyArgument",
	"when": "argument field is empty or whitespace-only",
	"message_template": "Argument cannot be empty for {action_type}"
	},
	{
	"name": "DatabaseNotFound",
	"when": "SQLite file not found during reset",
	"message_template": "Database '{db_name}' not found in {db_dir}"
	}
	],
	"retry_strategy": null
	},

	"dependencies": {
	"external": [
	"sqlite3 (stdlib)",
	"pydantic",
	"openenv (core.env_server)",
	"torch"
	],
	"internal": [
	"models.py",
	"server/sql_environment.py",
	"server/app.py",
	"client.py",
	"data/databases/models.py",
	"data/questions/student_assessment.json"
	]
	}
	}