"""
tasks.py — SQL Query Optimization Tasks
========================================
Five tasks of increasing difficulty, each with a DuckDB-executable
"bad" query (stored in sql_query) that agents must analyze and rewrite.
All queries run against the executor's synthetic tables:
    users    (10,000 rows)    — id, email, tier, region, plan, created_at
    orders   (500,000 rows)   — id, customer_id, product_id, status, total, created_at
    products (1,000 rows)     — id, name, category, price
    events   (1,000,000 rows) — id, user_id, session_id, event_type, occurred_at
"""
from typing import Any, Dict, List
# Task registry: task_id -> task definition consumed by the task runner.
# Each entry pairs a deliberately inefficient sql_query with the
# ground-truth issues an agent is expected to report.
# NOTE(review): mojibake repaired throughout ("β" -> "—", "Γ" -> "×");
# all other task data is unchanged.
TASKS: Dict[str, Dict[str, Any]] = {
    # ─────────────────────────────────────────────────────────────────
    # TASK 1 — EASY: Basic Anti-pattern Detection
    # ─────────────────────────────────────────────────────────────────
    "task_1_basic_antipatterns": {
        "task_id": "task_1_basic_antipatterns",
        "task_name": "Basic SQL Anti-pattern Detection",
        "task_description": (
            "Analyze the SQL query below for common anti-patterns that destroy performance. "
            "Identify: SELECT * (fetches unnecessary columns from 500k rows), "
            "CAST on a filter column (prevents any index or min/max pruning), "
            "and a function applied to a date column (forces full table evaluation). "
            "For each issue report: issue_type, line, description, severity, and a concrete fix. "
            "Also provide a fully rewritten optimized_query — it will be EXECUTED against "
            "real data and your speedup will be measured."
        ),
        "difficulty": "easy",
        "dialect": "duckdb/postgresql",
        "max_steps": 3,
        "schema_info": (
            "Table: orders (500,000 rows)\n"
            "  id INT, customer_id INT, product_id INT,\n"
            "  status VARCHAR, total DECIMAL, created_at DATE\n\n"
            "No indexes defined (DuckDB uses columnar min/max pruning when columns "
            "are not wrapped in functions).\n"
            "Scan cost: ~500k rows × all columns with SELECT *"
        ),
        "sql_query": (
            "SELECT *\n"
            "FROM orders\n"
            "WHERE CAST(customer_id AS VARCHAR) = '5000'\n"
            "  AND year(created_at) = 2024;"
        ),
        "ground_truth_issues": [
            {
                "type": "select_star",
                "line": 1,
                "keywords": [
                    "select *", "star", "all columns", "unnecessary columns",
                    "column projection", "specify columns", "bandwidth",
                ],
            },
            {
                "type": "non_sargable_cast",
                "line": 3,
                "keywords": [
                    "cast", "varchar", "type cast", "type conversion",
                    "non-sargable", "sargable", "integer comparison",
                    "string comparison", "prevents", "pruning",
                ],
            },
            {
                "type": "function_on_date_column",
                "line": 4,
                "keywords": [
                    "year(", "function on column", "non-sargable", "date range",
                    "between", "extract", "full scan", "date filter",
                ],
            },
        ],
        "approved_expected": False,
    },
    # ─────────────────────────────────────────────────────────────────
    # TASK 2 — MEDIUM: N+1 Correlated Subqueries
    # ─────────────────────────────────────────────────────────────────
    "task_2_correlated_subqueries": {
        "task_id": "task_2_correlated_subqueries",
        "task_name": "N+1 Correlated Subquery Elimination",
        "task_description": (
            "The query below uses three correlated scalar subqueries — each one scans "
            "the entire orders table (500k rows) once per premium user (~3,300 users). "
            "That's ~10 million row reads just for aggregation. "
            "Identify the N+1 pattern, explain why each subquery is harmful, "
            "and rewrite the query as a single aggregation JOIN. "
            "Your optimized_query will be executed; results must match the original."
        ),
        "difficulty": "medium",
        "dialect": "duckdb/postgresql",
        "max_steps": 4,
        "schema_info": (
            "Table: users (10,000 rows)\n"
            "  id INT, email VARCHAR, tier VARCHAR, region VARCHAR,\n"
            "  plan VARCHAR, created_at DATE\n\n"
            "Table: orders (500,000 rows)\n"
            "  id INT, customer_id INT, product_id INT,\n"
            "  status VARCHAR, total DECIMAL, created_at DATE\n\n"
            "Premium users: ~3,300 | Orders per user avg: 50\n"
            "Worst-case scans: 3 subqueries × 3,300 users × 500k rows = ~5B row reads"
        ),
        "sql_query": (
            "SELECT\n"
            "  u.email,\n"
            "  u.region,\n"
            "  (SELECT COUNT(*)\n"
            "   FROM orders o\n"
            "   WHERE o.customer_id = u.id AND o.status = 'completed') AS completed_orders,\n"
            "  (SELECT SUM(o.total)\n"
            "   FROM orders o\n"
            "   WHERE o.customer_id = u.id\n"
            "     AND o.created_at >= DATE '2024-01-01') AS ytd_spend,\n"
            "  (SELECT total\n"
            "   FROM orders o\n"
            "   WHERE o.customer_id = u.id\n"
            "   ORDER BY created_at DESC LIMIT 1) AS last_order_amount\n"
            "FROM users u\n"
            "WHERE u.tier = 'premium';"
        ),
        "ground_truth_issues": [
            {
                "type": "correlated_subquery_count",
                "line": 4,
                "keywords": [
                    "correlated", "subquery", "per row", "n+1", "each user",
                    "repeated scan", "join", "aggregation",
                ],
            },
            {
                "type": "correlated_subquery_sum",
                "line": 7,
                "keywords": [
                    "correlated", "subquery", "per row", "n+1", "each user",
                    "repeated scan", "join", "group by",
                ],
            },
            {
                "type": "correlated_subquery_limit",
                "line": 11,
                "keywords": [
                    "correlated", "subquery", "limit 1", "order by", "lateral",
                    "row_number", "rank", "window function", "per row",
                ],
            },
            {
                "type": "missing_aggregation_join",
                "line": 16,
                "keywords": [
                    "left join", "group by", "aggreg", "single pass",
                    "coalesce", "join aggregat",
                ],
            },
        ],
        "approved_expected": False,
    },
    # ─────────────────────────────────────────────────────────────────
    # TASK 3 — MEDIUM-HARD: Wildcard LIKE Full Scan
    # ─────────────────────────────────────────────────────────────────
    "task_3_wildcard_scan": {
        "task_id": "task_3_wildcard_scan",
        "task_name": "Wildcard LIKE & Projection Optimization",
        "task_description": (
            "The query scans all 1,000,000 events rows with leading and trailing wildcard "
            "LIKE patterns — these disable min/max pruning and force full column evaluation. "
            "It also computes derived columns for every row before filtering. "
            "Identify: leading-wildcard LIKE patterns that kill pruning, "
            "SELECT * on a million-row table, redundant OR conditions, "
            "and unnecessary computed columns evaluated before WHERE filtering. "
            "Rewrite to use exact equality and minimal column projection."
        ),
        "difficulty": "medium-hard",
        "dialect": "duckdb/postgresql",
        "max_steps": 4,
        "schema_info": (
            "Table: events (1,000,000 rows)\n"
            "  id INT, user_id INT, session_id VARCHAR,\n"
            "  event_type VARCHAR, occurred_at DATE\n\n"
            "Distinct event_type values: purchase, view, click, signup, logout, search\n"
            "Wildcard LIKE on all 1M rows: forces full column scan\n"
            "Exact equality match: enables columnar zone-map pruning"
        ),
        "sql_query": (
            "SELECT\n"
            "  *,\n"
            "  CAST(id AS VARCHAR) || '_' || event_type AS event_key,\n"
            "  upper(event_type) AS event_type_upper\n"
            "FROM events\n"
            "WHERE event_type LIKE '%purchase%'\n"
            "   OR event_type LIKE '%buy%'\n"
            "   OR session_id LIKE 'sess_%';"
        ),
        "ground_truth_issues": [
            {
                "type": "leading_wildcard_like",
                "line": 6,
                "keywords": [
                    "leading wildcard", "like '%", "full scan", "exact match",
                    "equality", "pruning disabled", "wildcard", "zone map",
                ],
            },
            {
                "type": "or_expands_to_full_scan",
                "line": 7,
                "keywords": [
                    "or condition", "union", "separate queries", "or expands",
                    "full scan", "like '%buy%'", "redundant",
                ],
            },
            {
                "type": "select_star_large_table",
                "line": 2,
                "keywords": [
                    "select *", "1 million", "all columns", "projection",
                    "column pruning", "unnecessary", "bandwidth",
                ],
            },
            {
                "type": "pre_filter_computed_columns",
                "line": 3,
                "keywords": [
                    "computed column", "derived", "upper(", "cast(", "concatenat",
                    "before filter", "pre-filter", "push down", "CTE",
                ],
            },
        ],
        "approved_expected": False,
    },
    # ─────────────────────────────────────────────────────────────────
    # TASK 4 — HARD: Implicit Cross Join + Repeated Scalar Subqueries
    # ─────────────────────────────────────────────────────────────────
    "task_4_implicit_join": {
        "task_id": "task_4_implicit_join",
        "task_name": "Implicit Cross Join & Scalar Subquery Elimination",
        "task_description": (
            "This query uses comma-separated FROM (implicit cross join syntax) and "
            "two correlated scalar subqueries that re-aggregate the entire orders table "
            "once per GROUP BY group. "
            "Identify: implicit cross join risk (comma in FROM clause), "
            "two correlated scalar subqueries recalculating global stats, "
            "and the GROUP BY without an explicit JOIN. "
            "Rewrite using explicit INNER JOIN and a CTE/subquery for the global stats "
            "so they are computed exactly once."
        ),
        "difficulty": "hard",
        "dialect": "duckdb/postgresql",
        "max_steps": 5,
        "schema_info": (
            "Table: users (10,000 rows) — id, email, tier, region, plan, created_at\n"
            "Table: orders (500,000 rows) — id, customer_id, product_id, status, total, created_at\n\n"
            "Join: users.id = orders.customer_id\n"
            "Implicit join (comma syntax) risk: if WHERE predicate is missing,\n"
            "produces a Cartesian product of 10k × 500k = 5 BILLION rows.\n"
            "Scalar subqueries: each recalculates over all 500k orders per group."
        ),
        "sql_query": (
            "SELECT\n"
            "  u.region,\n"
            "  u.plan,\n"
            "  COUNT(*) AS total_orders,\n"
            "  SUM(o.total) AS revenue,\n"
            "  (SELECT AVG(total) FROM orders) AS global_avg,\n"
            "  (SELECT MAX(total) FROM orders WHERE status = 'completed') AS max_deal\n"
            "FROM users u, orders o\n"
            "WHERE u.id = o.customer_id\n"
            "  AND o.status IN ('completed', 'shipped')\n"
            "GROUP BY u.region, u.plan;"
        ),
        "ground_truth_issues": [
            {
                "type": "implicit_cross_join",
                "line": 8,
                "keywords": [
                    "implicit", "cross join", "comma join", "explicit join",
                    "inner join", "cartesian", "comma in from",
                ],
            },
            {
                "type": "repeated_scalar_subquery_avg",
                "line": 6,
                "keywords": [
                    "scalar subquery", "correlated", "per group", "once per row",
                    "cte", "with clause", "pre-compute", "global avg",
                ],
            },
            {
                "type": "repeated_scalar_subquery_max",
                "line": 7,
                "keywords": [
                    "scalar subquery", "correlated", "per group", "max deal",
                    "cte", "pre-compute", "compute once", "constant",
                ],
            },
            {
                "type": "missing_explicit_join",
                "line": 8,
                "keywords": [
                    "inner join", "explicit", "on clause", "join condition",
                    "readable", "maintainable", "ansi sql",
                ],
            },
        ],
        "approved_expected": False,
    },
    # ─────────────────────────────────────────────────────────────────
    # TASK 5 — EXPERT: Window Function Over Entire 1M-Row Table
    # ─────────────────────────────────────────────────────────────────
    "task_5_window_functions": {
        "task_id": "task_5_window_functions",
        "task_name": "Window Function & Full-Scan Audit",
        "task_description": (
            "Five window functions are computed over ALL 1,000,000 events rows with no "
            "pre-filtering. Each OVER() clause requires a full sort or hash-aggregate pass. "
            "The global RANK() OVER (ORDER BY occurred_at) requires sorting the entire "
            "table — the most expensive operation here. "
            "Identify: no WHERE clause causing full 1M-row scans, "
            "redundant window functions that can be merged, "
            "a global ordering window function with no PARTITION, "
            "and SELECT * on the full events table. "
            "Rewrite to filter first, merge windows, and remove the global RANK."
        ),
        "difficulty": "expert",
        "dialect": "duckdb/postgresql",
        "max_steps": 5,
        "schema_info": (
            "Table: events (1,000,000 rows)\n"
            "  id INT, user_id INT, session_id VARCHAR,\n"
            "  event_type VARCHAR, occurred_at DATE\n\n"
            "Window function cost: each OVER() = full sort/hash pass over 1M rows\n"
            "5 window functions = 5 full passes before any filtering\n"
            "Global RANK(): sorts all 1M rows globally — most expensive operation\n"
            "Filtering to 'purchase' events first reduces dataset to ~167k rows (1/6)"
        ),
        "sql_query": (
            "SELECT\n"
            "  user_id,\n"
            "  event_type,\n"
            "  occurred_at,\n"
            "  COUNT(*) OVER (PARTITION BY user_id) AS total_user_events,\n"
            "  COUNT(*) OVER (PARTITION BY user_id, event_type) AS type_count,\n"
            "  ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY occurred_at DESC) AS recency_rank,\n"
            "  RANK() OVER (ORDER BY occurred_at DESC) AS global_rank,\n"
            "  SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END)\n"
            "    OVER (PARTITION BY user_id) AS user_purchases\n"
            "FROM events;"
        ),
        "ground_truth_issues": [
            {
                "type": "no_pre_filter",
                "line": 11,
                "keywords": [
                    "no where", "no filter", "full table", "1 million", "all rows",
                    "pre-filter", "filter first", "cte", "with clause",
                ],
            },
            {
                "type": "global_rank_no_partition",
                "line": 8,
                "keywords": [
                    "rank() over", "global rank", "no partition", "entire table",
                    "full sort", "expensive", "global ordering", "remove",
                ],
            },
            {
                "type": "redundant_window_functions",
                "line": 5,
                "keywords": [
                    "5 window", "multiple over", "redundant", "merge", "combine",
                    "single pass", "same partition", "consolidate",
                ],
            },
            {
                "type": "count_vs_conditional_sum",
                "line": 9,
                "keywords": [
                    "case when", "sum case", "count filter", "filter clause",
                    "count(*) filter", "simpler", "merge with",
                ],
            },
            {
                "type": "select_all_unfiltered",
                "line": 1,
                "keywords": [
                    "select *", "user_id, event_type", "projection", "column pruning",
                    "select specific", "1 million rows", "bandwidth",
                ],
            },
        ],
        "approved_expected": False,
    },
}
def get_task_list() -> List[Dict[str, Any]]:
    """Return a lightweight summary of every registered task.

    Each summary exposes the task's identity, difficulty, step budget,
    description, and the action schema an agent's answer must follow.
    Heavy fields (sql_query, schema_info, ground_truth_issues) are
    deliberately omitted so the list stays cheap to serialize.

    Returns:
        A list with one dict per entry in TASKS, in registry order.
    """
    return [
        {
            "task_id": t["task_id"],
            "task_name": t["task_name"],
            "difficulty": t["difficulty"],
            "max_steps": t["max_steps"],
            "description": t["task_description"],
            # Contract for agent responses; mojibake repaired ("β" -> "—").
            "action_schema": {
                "suggestions": "List of {issue_type, line, description, severity, fix}",
                "optimized_query": "str — complete rewritten SQL (will be EXECUTED for real timing)",
                "summary": "str — overall performance analysis",
                "estimated_improvement": "str — expected speedup (e.g. '10x faster')",
                "approved": "bool — True if already optimal",
            },
        }
        for t in TASKS.values()
    ]