"""
tasks.py -- SQL Query Optimization Tasks
========================================
Five tasks of increasing difficulty, each with a DuckDB-executable
"bad" query (stored in sql_query) that agents must analyze and rewrite.
All queries run against the executor's synthetic tables:
    users    (10,000 rows)     -- id, email, tier, region, plan, created_at
    orders   (500,000 rows)    -- id, customer_id, product_id, status, total, created_at
    products (1,000 rows)      -- id, name, category, price
    events   (1,000,000 rows)  -- id, user_id, session_id, event_type, occurred_at
"""

from typing import Any, Dict, List

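# The real DDL lives in the executor; the sketch below is only inferred from
# the schema_info blocks in the tasks (the products column types are
# assumptions, since no task states them):
#   CREATE TABLE users    (id INT, email VARCHAR, tier VARCHAR, region VARCHAR,
#                          plan VARCHAR, created_at DATE);
#   CREATE TABLE orders   (id INT, customer_id INT, product_id INT,
#                          status VARCHAR, total DECIMAL, created_at DATE);
#   CREATE TABLE products (id INT, name VARCHAR, category VARCHAR, price DECIMAL);
#   CREATE TABLE events   (id INT, user_id INT, session_id VARCHAR,
#                          event_type VARCHAR, occurred_at DATE);
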
TASKS: Dict[str, Dict[str, Any]] = {
# ─────────────────────────────────────────────────────────────────
    # TASK 1 -- EASY: Basic Anti-pattern Detection
# ─────────────────────────────────────────────────────────────────
"task_1_basic_antipatterns": {
"task_id": "task_1_basic_antipatterns",
"task_name": "Basic SQL Anti-pattern Detection",
"task_description": (
"Analyze the SQL query below for common anti-patterns that destroy performance. "
"Identify: SELECT * (fetches unnecessary columns from 500k rows), "
"CAST on a filter column (prevents any index or min/max pruning), "
"and a function applied to a date column (forces full table evaluation). "
"For each issue report: issue_type, line, description, severity, and a concrete fix. "
"Also provide a fully rewritten optimized_query β€” it will be EXECUTED against "
"real data and your speedup will be measured."
),
"difficulty": "easy",
"dialect": "duckdb/postgresql",
"max_steps": 3,
"schema_info": (
"Table: orders (500,000 rows)\n"
" id INT, customer_id INT, product_id INT,\n"
" status VARCHAR, total DECIMAL, created_at DATE\n\n"
"No indexes defined (DuckDB uses columnar min/max pruning when columns "
"are not wrapped in functions).\n"
"Scan cost: ~500k rows Γ— all columns with SELECT *"
),
"sql_query": (
"SELECT *\n"
"FROM orders\n"
"WHERE CAST(customer_id AS VARCHAR) = '5000'\n"
" AND year(created_at) = 2024;"
),
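        # One plausible optimized rewrite (illustrative sketch, not the graded
        # reference; the projection list below is an assumption -- keep whatever
        # columns the caller actually needs):
        #   SELECT id, status, total, created_at
        #   FROM orders
        #   WHERE customer_id = 5000
        #     AND created_at >= DATE '2024-01-01'
        #     AND created_at <  DATE '2025-01-01';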
"ground_truth_issues": [
{
"type": "select_star",
"line": 1,
"keywords": [
"select *", "star", "all columns", "unnecessary columns",
"column projection", "specify columns", "bandwidth",
],
},
{
"type": "non_sargable_cast",
"line": 3,
"keywords": [
"cast", "varchar", "type cast", "type conversion",
"non-sargable", "sargable", "integer comparison",
"string comparison", "prevents", "pruning",
],
},
{
"type": "function_on_date_column",
"line": 4,
"keywords": [
"year(", "function on column", "non-sargable", "date range",
"between", "extract", "full scan", "date filter",
],
},
],
"approved_expected": False,
},
# ─────────────────────────────────────────────────────────────────
    # TASK 2 -- MEDIUM: N+1 Correlated Subqueries
# ─────────────────────────────────────────────────────────────────
"task_2_correlated_subqueries": {
"task_id": "task_2_correlated_subqueries",
"task_name": "N+1 Correlated Subquery Elimination",
"task_description": (
"The query below uses three correlated scalar subqueries β€” each one scans "
"the entire orders table (500k rows) once per premium user (~3,300 users). "
"That's ~10 million row reads just for aggregation. "
"Identify the N+1 pattern, explain why each subquery is harmful, "
"and rewrite the query as a single aggregation JOIN. "
"Your optimized_query will be executed; results must match the original."
),
"difficulty": "medium",
"dialect": "duckdb/postgresql",
"max_steps": 4,
"schema_info": (
"Table: users (10,000 rows)\n"
" id INT, email VARCHAR, tier VARCHAR, region VARCHAR,\n"
" plan VARCHAR, created_at DATE\n\n"
"Table: orders (500,000 rows)\n"
" id INT, customer_id INT, product_id INT,\n"
" status VARCHAR, total DECIMAL, created_at DATE\n\n"
"Premium users: ~3,300 | Orders per user avg: 50\n"
"Worst-case scans: 3 subqueries Γ— 3,300 users Γ— 500k rows = ~5B row reads"
),
"sql_query": (
"SELECT\n"
" u.email,\n"
" u.region,\n"
" (SELECT COUNT(*)\n"
" FROM orders o\n"
" WHERE o.customer_id = u.id AND o.status = 'completed') AS completed_orders,\n"
" (SELECT SUM(o.total)\n"
" FROM orders o\n"
" WHERE o.customer_id = u.id\n"
" AND o.created_at >= DATE '2024-01-01') AS ytd_spend,\n"
" (SELECT total\n"
" FROM orders o\n"
" WHERE o.customer_id = u.id\n"
" ORDER BY created_at DESC LIMIT 1) AS last_order_amount\n"
"FROM users u\n"
"WHERE u.tier = 'premium';"
),
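        # One plausible single-pass rewrite (illustrative sketch; ARG_MAX is
        # DuckDB-specific -- on PostgreSQL use a window function or LATERAL for
        # the latest-order amount):
        #   SELECT
        #       u.email,
        #       u.region,
        #       COALESCE(a.completed_orders, 0) AS completed_orders,
        #       a.ytd_spend,
        #       a.last_order_amount
        #   FROM users u
        #   LEFT JOIN (
        #       SELECT
        #           customer_id,
        #           COUNT(*) FILTER (WHERE status = 'completed')              AS completed_orders,
        #           SUM(total) FILTER (WHERE created_at >= DATE '2024-01-01') AS ytd_spend,
        #           ARG_MAX(total, created_at)                                AS last_order_amount
        #       FROM orders
        #       GROUP BY customer_id
        #   ) a ON a.customer_id = u.id
        #   WHERE u.tier = 'premium';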
"ground_truth_issues": [
{
"type": "correlated_subquery_count",
"line": 4,
"keywords": [
"correlated", "subquery", "per row", "n+1", "each user",
"repeated scan", "join", "aggregation",
],
},
{
"type": "correlated_subquery_sum",
"line": 7,
"keywords": [
"correlated", "subquery", "per row", "n+1", "each user",
"repeated scan", "join", "group by",
],
},
{
"type": "correlated_subquery_limit",
"line": 11,
"keywords": [
"correlated", "subquery", "limit 1", "order by", "lateral",
"row_number", "rank", "window function", "per row",
],
},
{
"type": "missing_aggregation_join",
"line": 16,
"keywords": [
"left join", "group by", "aggreg", "single pass",
"coalesce", "join aggregat",
],
},
],
"approved_expected": False,
},
# ─────────────────────────────────────────────────────────────────
    # TASK 3 -- MEDIUM-HARD: Wildcard LIKE Full Scan
# ─────────────────────────────────────────────────────────────────
"task_3_wildcard_scan": {
"task_id": "task_3_wildcard_scan",
"task_name": "Wildcard LIKE & Projection Optimization",
"task_description": (
"The query scans all 1,000,000 events rows with leading and trailing wildcard "
"LIKE patterns β€” these disable min/max pruning and force full column evaluation. "
"It also computes derived columns for every row before filtering. "
"Identify: leading-wildcard LIKE patterns that kill pruning, "
"SELECT * on a million-row table, redundant OR conditions, "
"and unnecessary computed columns evaluated before WHERE filtering. "
"Rewrite to use exact equality and minimal column projection."
),
"difficulty": "medium-hard",
"dialect": "duckdb/postgresql",
"max_steps": 4,
"schema_info": (
"Table: events (1,000,000 rows)\n"
" id INT, user_id INT, session_id VARCHAR,\n"
" event_type VARCHAR, occurred_at DATE\n\n"
"Distinct event_type values: purchase, view, click, signup, logout, search\n"
"Wildcard LIKE on all 1M rows: forces full column scan\n"
"Exact equality match: enables columnar zone-map pruning"
),
"sql_query": (
"SELECT\n"
" *,\n"
" CAST(id AS VARCHAR) || '_' || event_type AS event_key,\n"
" upper(event_type) AS event_type_upper\n"
"FROM events\n"
"WHERE event_type LIKE '%purchase%'\n"
" OR event_type LIKE '%buy%'\n"
" OR session_id LIKE 'sess_%';"
),
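        # One plausible rewrite (illustrative sketch; it assumes the derived
        # event_key/event_type_upper columns are still wanted downstream, and
        # that '%buy%' matches no known event_type value so it can be dropped):
        #   SELECT
        #       id, user_id, session_id, event_type, occurred_at,
        #       CAST(id AS VARCHAR) || '_' || event_type AS event_key,
        #       upper(event_type) AS event_type_upper
        #   FROM events
        #   WHERE event_type = 'purchase'
        #      OR session_id LIKE 'sess_%';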
"ground_truth_issues": [
{
"type": "leading_wildcard_like",
"line": 6,
"keywords": [
"leading wildcard", "like '%", "full scan", "exact match",
"equality", "pruning disabled", "wildcard", "zone map",
],
},
{
"type": "or_expands_to_full_scan",
"line": 7,
"keywords": [
"or condition", "union", "separate queries", "or expands",
"full scan", "like '%buy%'", "redundant",
],
},
{
"type": "select_star_large_table",
"line": 2,
"keywords": [
"select *", "1 million", "all columns", "projection",
"column pruning", "unnecessary", "bandwidth",
],
},
{
"type": "pre_filter_computed_columns",
"line": 3,
"keywords": [
"computed column", "derived", "upper(", "cast(", "concatenat",
"before filter", "pre-filter", "push down", "CTE",
],
},
],
"approved_expected": False,
},
# ─────────────────────────────────────────────────────────────────
    # TASK 4 -- HARD: Implicit Cross Join + Repeated Scalar Subqueries
# ─────────────────────────────────────────────────────────────────
"task_4_implicit_join": {
"task_id": "task_4_implicit_join",
"task_name": "Implicit Cross Join & Scalar Subquery Elimination",
"task_description": (
"This query uses comma-separated FROM (implicit cross join syntax) and "
"two correlated scalar subqueries that re-aggregate the entire orders table "
"once per GROUP BY group. "
"Identify: implicit cross join risk (comma in FROM clause), "
"two correlated scalar subqueries recalculating global stats, "
"and the GROUP BY without an explicit JOIN. "
"Rewrite using explicit INNER JOIN and a CTE/subquery for the global stats "
"so they are computed exactly once."
),
"difficulty": "hard",
"dialect": "duckdb/postgresql",
"max_steps": 5,
"schema_info": (
"Table: users (10,000 rows) β€” id, email, tier, region, plan, created_at\n"
"Table: orders (500,000 rows) β€” id, customer_id, product_id, status, total, created_at\n\n"
"Join: users.id = orders.customer_id\n"
"Implicit join (comma syntax) risk: if WHERE predicate is missing,\n"
"produces a Cartesian product of 10k Γ— 500k = 5 BILLION rows.\n"
"Scalar subqueries: each recalculates over all 500k orders per group."
),
"sql_query": (
"SELECT\n"
" u.region,\n"
" u.plan,\n"
" COUNT(*) AS total_orders,\n"
" SUM(o.total) AS revenue,\n"
" (SELECT AVG(total) FROM orders) AS global_avg,\n"
" (SELECT MAX(total) FROM orders WHERE status = 'completed') AS max_deal\n"
"FROM users u, orders o\n"
"WHERE u.id = o.customer_id\n"
" AND o.status IN ('completed', 'shipped')\n"
"GROUP BY u.region, u.plan;"
),
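        # One plausible rewrite (illustrative sketch): compute the global stats
        # exactly once in a CTE, then attach them with a CROSS JOIN (safe here
        # because the CTE yields a single row):
        #   WITH global_stats AS (
        #       SELECT
        #           AVG(total) AS global_avg,
        #           MAX(total) FILTER (WHERE status = 'completed') AS max_deal
        #       FROM orders
        #   )
        #   SELECT
        #       u.region,
        #       u.plan,
        #       COUNT(*)     AS total_orders,
        #       SUM(o.total) AS revenue,
        #       g.global_avg,
        #       g.max_deal
        #   FROM users u
        #   INNER JOIN orders o ON o.customer_id = u.id
        #   CROSS JOIN global_stats g
        #   WHERE o.status IN ('completed', 'shipped')
        #   GROUP BY u.region, u.plan, g.global_avg, g.max_deal;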
"ground_truth_issues": [
{
"type": "implicit_cross_join",
"line": 8,
"keywords": [
"implicit", "cross join", "comma join", "explicit join",
"inner join", "cartesian", "comma in from",
],
},
{
"type": "repeated_scalar_subquery_avg",
"line": 6,
"keywords": [
"scalar subquery", "correlated", "per group", "once per row",
"cte", "with clause", "pre-compute", "global avg",
],
},
{
"type": "repeated_scalar_subquery_max",
"line": 7,
"keywords": [
"scalar subquery", "correlated", "per group", "max deal",
"cte", "pre-compute", "compute once", "constant",
],
},
{
"type": "missing_explicit_join",
"line": 8,
"keywords": [
"inner join", "explicit", "on clause", "join condition",
"readable", "maintainable", "ansi sql",
],
},
],
"approved_expected": False,
},
# ─────────────────────────────────────────────────────────────────
    # TASK 5 -- EXPERT: Window Function Over Entire 1M-Row Table
# ─────────────────────────────────────────────────────────────────
"task_5_window_functions": {
"task_id": "task_5_window_functions",
"task_name": "Window Function & Full-Scan Audit",
"task_description": (
"Five window functions are computed over ALL 1,000,000 events rows with no "
"pre-filtering. Each OVER() clause requires a full sort or hash-aggregate pass. "
"The global RANK() OVER (ORDER BY occurred_at) requires sorting the entire "
"table β€” the most expensive operation here. "
"Identify: no WHERE clause causing full 1M-row scans, "
"redundant window functions that can be merged, "
"a global ordering window function with no PARTITION, "
"and SELECT * on the full events table. "
"Rewrite to filter first, merge windows, and remove the global RANK."
),
"difficulty": "expert",
"dialect": "duckdb/postgresql",
"max_steps": 5,
"schema_info": (
"Table: events (1,000,000 rows)\n"
" id INT, user_id INT, session_id VARCHAR,\n"
" event_type VARCHAR, occurred_at DATE\n\n"
"Window function cost: each OVER() = full sort/hash pass over 1M rows\n"
"5 window functions = 5 full passes before any filtering\n"
"Global RANK(): sorts all 1M rows globally β€” most expensive operation\n"
"Filtering to 'purchase' events first reduces dataset to ~167k rows (1/6)"
),
"sql_query": (
"SELECT\n"
" user_id,\n"
" event_type,\n"
" occurred_at,\n"
" COUNT(*) OVER (PARTITION BY user_id) AS total_user_events,\n"
" COUNT(*) OVER (PARTITION BY user_id, event_type) AS type_count,\n"
" ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY occurred_at DESC) AS recency_rank,\n"
" RANK() OVER (ORDER BY occurred_at DESC) AS global_rank,\n"
" SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END)\n"
" OVER (PARTITION BY user_id) AS user_purchases\n"
"FROM events;"
),
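        # One plausible rewrite (illustrative sketch; it assumes, per the task
        # description, that only 'purchase' events are needed downstream --
        # after that filter, type_count and user_purchases collapse into one
        # per-user count and the global RANK can simply be dropped):
        #   WITH purchase_events AS (
        #       SELECT user_id, event_type, occurred_at
        #       FROM events
        #       WHERE event_type = 'purchase'
        #   )
        #   SELECT
        #       user_id,
        #       event_type,
        #       occurred_at,
        #       COUNT(*)     OVER (PARTITION BY user_id) AS user_purchases,
        #       ROW_NUMBER() OVER (PARTITION BY user_id
        #                          ORDER BY occurred_at DESC) AS recency_rank
        #   FROM purchase_events;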
"ground_truth_issues": [
{
"type": "no_pre_filter",
"line": 11,
"keywords": [
"no where", "no filter", "full table", "1 million", "all rows",
"pre-filter", "filter first", "cte", "with clause",
],
},
{
"type": "global_rank_no_partition",
"line": 8,
"keywords": [
"rank() over", "global rank", "no partition", "entire table",
"full sort", "expensive", "global ordering", "remove",
],
},
{
"type": "redundant_window_functions",
"line": 5,
"keywords": [
"5 window", "multiple over", "redundant", "merge", "combine",
"single pass", "same partition", "consolidate",
],
},
{
"type": "count_vs_conditional_sum",
"line": 9,
"keywords": [
"case when", "sum case", "count filter", "filter clause",
"count(*) filter", "simpler", "merge with",
],
},
{
"type": "select_all_unfiltered",
"line": 1,
"keywords": [
"select *", "user_id, event_type", "projection", "column pruning",
"select specific", "1 million rows", "bandwidth",
],
},
],
"approved_expected": False,
},
}
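

# Illustrative example of a well-formed agent action (the grading contract
# lives in the env/executor; the field names below just mirror action_schema):
#   {
#       "suggestions": [
#           {"issue_type": "select_star", "line": 1, "severity": "medium",
#            "description": "SELECT * fetches every column from 500k rows",
#            "fix": "Project only the columns the caller needs"},
#       ],
#       "optimized_query": "SELECT id, total FROM orders WHERE customer_id = 5000;",
#       "summary": "Removed SELECT * and made both predicates sargable.",
#       "estimated_improvement": "10x faster",
#       "approved": False,
#   }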
def get_task_list() -> List[Dict[str, Any]]:
    """Return a lightweight, JSON-serializable summary of every task."""
    return [
{
"task_id": t["task_id"],
"task_name": t["task_name"],
"difficulty": t["difficulty"],
"max_steps": t["max_steps"],
"description": t["task_description"],
"action_schema": {
"suggestions": "List of {issue_type, line, description, severity, fix}",
"optimized_query": "str β€” complete rewritten SQL (will be EXECUTED for real timing)",
"summary": "str β€” overall performance analysis",
"estimated_improvement": "str β€” expected speedup (e.g. '10x faster')",
"approved": "bool β€” True if already optimal",
},
}
for t in TASKS.values()
]
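

if __name__ == "__main__":
    # Minimal smoke test: print the task roster. Assumes nothing beyond the
    # standard library.
    for task in get_task_list():
        print(f"{task['task_id']:<32} {task['difficulty']:<12} max_steps={task['max_steps']}")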