"""
tasks.py - SQL Query Optimization Tasks
========================================
Five tasks of increasing difficulty, each with a DuckDB-executable
"bad" query (stored in sql_query) that agents must analyze and rewrite.

All queries run against the executor's synthetic tables:
  users    (10,000 rows)    - id, email, tier, region, plan, created_at
  orders   (500,000 rows)   - id, customer_id, product_id, status, total, created_at
  products (1,000 rows)     - id, name, category, price
  events   (1,000,000 rows) - id, user_id, session_id, event_type, occurred_at
"""

from typing import Any, Dict, List

TASKS: Dict[str, Dict[str, Any]] = {

    # ─────────────────────────────────────────────────────────────────
    # TASK 1 - EASY: Basic Anti-pattern Detection
    # ─────────────────────────────────────────────────────────────────
    "task_1_basic_antipatterns": {
        "task_id":   "task_1_basic_antipatterns",
        "task_name": "Basic SQL Anti-pattern Detection",
        "task_description": (
            "Analyze the SQL query below for common anti-patterns that destroy performance. "
            "Identify: SELECT * (fetches unnecessary columns from 500k rows), "
            "CAST on a filter column (prevents any index or min/max pruning), "
            "and a function applied to a date column (forces full table evaluation). "
            "For each issue report: issue_type, line, description, severity, and a concrete fix. "
            "Also provide a fully rewritten optimized_query β€” it will be EXECUTED against "
            "real data and your speedup will be measured."
        ),
        "difficulty": "easy",
        "dialect":   "duckdb/postgresql",
        "max_steps": 3,
        "schema_info": (
            "Table: orders (500,000 rows)\n"
            "  id INT, customer_id INT, product_id INT,\n"
            "  status VARCHAR, total DECIMAL, created_at DATE\n\n"
            "No indexes defined (DuckDB uses columnar min/max pruning when columns "
            "are not wrapped in functions).\n"
            "Scan cost: ~500k rows Γ— all columns with SELECT *"
        ),
        "sql_query": (
            "SELECT *\n"
            "FROM orders\n"
            "WHERE CAST(customer_id AS VARCHAR) = '5000'\n"
            "  AND year(created_at) = 2024;"
        ),
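        # A minimal illustrative rewrite (a sketch, not the graded reference
        # answer; any semantically equivalent sargable form works as well):
        #
        #   SELECT id, customer_id, product_id, status, total, created_at
        #   FROM orders
        #   WHERE customer_id = 5000                 -- integer compare, no CAST
        #     AND created_at >= DATE '2024-01-01'    -- sargable range replaces
        #     AND created_at <  DATE '2025-01-01';   -- year(created_at) = 2024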
        "ground_truth_issues": [
            {
                "type": "select_star",
                "line": 1,
                "keywords": [
                    "select *", "star", "all columns", "unnecessary columns",
                    "column projection", "specify columns", "bandwidth",
                ],
            },
            {
                "type": "non_sargable_cast",
                "line": 3,
                "keywords": [
                    "cast", "varchar", "type cast", "type conversion",
                    "non-sargable", "sargable", "integer comparison",
                    "string comparison", "prevents", "pruning",
                ],
            },
            {
                "type": "function_on_date_column",
                "line": 4,
                "keywords": [
                    "year(", "function on column", "non-sargable", "date range",
                    "between", "extract", "full scan", "date filter",
                ],
            },
        ],
        "approved_expected": False,
    },

    # ─────────────────────────────────────────────────────────────────
    # TASK 2 - MEDIUM: N+1 Correlated Subqueries
    # ─────────────────────────────────────────────────────────────────
    "task_2_correlated_subqueries": {
        "task_id":   "task_2_correlated_subqueries",
        "task_name": "N+1 Correlated Subquery Elimination",
        "task_description": (
            "The query below uses three correlated scalar subqueries β€” each one scans "
            "the entire orders table (500k rows) once per premium user (~3,300 users). "
            "That's ~10 million row reads just for aggregation. "
            "Identify the N+1 pattern, explain why each subquery is harmful, "
            "and rewrite the query as a single aggregation JOIN. "
            "Your optimized_query will be executed; results must match the original."
        ),
        "difficulty": "medium",
        "dialect":   "duckdb/postgresql",
        "max_steps": 4,
        "schema_info": (
            "Table: users (10,000 rows)\n"
            "  id INT, email VARCHAR, tier VARCHAR, region VARCHAR,\n"
            "  plan VARCHAR, created_at DATE\n\n"
            "Table: orders (500,000 rows)\n"
            "  id INT, customer_id INT, product_id INT,\n"
            "  status VARCHAR, total DECIMAL, created_at DATE\n\n"
            "Premium users: ~3,300  |  Orders per user avg: 50\n"
            "Worst-case scans: 3 subqueries Γ— 3,300 users Γ— 500k rows = ~5B row reads"
        ),
        "sql_query": (
            "SELECT\n"
            "    u.email,\n"
            "    u.region,\n"
            "    (SELECT COUNT(*)\n"
            "     FROM orders o\n"
            "     WHERE o.customer_id = u.id AND o.status = 'completed') AS completed_orders,\n"
            "    (SELECT SUM(o.total)\n"
            "     FROM orders o\n"
            "     WHERE o.customer_id = u.id\n"
            "       AND o.created_at >= DATE '2024-01-01') AS ytd_spend,\n"
            "    (SELECT total\n"
            "     FROM orders o\n"
            "     WHERE o.customer_id = u.id\n"
            "     ORDER BY created_at DESC LIMIT 1) AS last_order_amount\n"
            "FROM users u\n"
            "WHERE u.tier = 'premium';"
        ),
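        # One possible single-pass rewrite (an illustrative sketch only; the
        # FILTER clause works in both DuckDB and PostgreSQL, and COALESCE
        # preserves the scalar subquery's COUNT-of-zero behavior):
        #
        #   WITH agg AS (
        #       SELECT customer_id,
        #              COUNT(*) FILTER (WHERE status = 'completed')              AS completed_orders,
        #              SUM(total) FILTER (WHERE created_at >= DATE '2024-01-01') AS ytd_spend
        #       FROM orders
        #       GROUP BY customer_id
        #   ), last_o AS (
        #       SELECT customer_id, total AS last_order_amount
        #       FROM (SELECT customer_id, total,
        #                    ROW_NUMBER() OVER (PARTITION BY customer_id
        #                                       ORDER BY created_at DESC) AS rn
        #             FROM orders) ranked
        #       WHERE rn = 1
        #   )
        #   SELECT u.email, u.region,
        #          COALESCE(agg.completed_orders, 0) AS completed_orders,
        #          agg.ytd_spend,
        #          last_o.last_order_amount
        #   FROM users u
        #   LEFT JOIN agg    ON agg.customer_id    = u.id
        #   LEFT JOIN last_o ON last_o.customer_id = u.id
        #   WHERE u.tier = 'premium';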
        "ground_truth_issues": [
            {
                "type": "correlated_subquery_count",
                "line": 4,
                "keywords": [
                    "correlated", "subquery", "per row", "n+1", "each user",
                    "repeated scan", "join", "aggregation",
                ],
            },
            {
                "type": "correlated_subquery_sum",
                "line": 7,
                "keywords": [
                    "correlated", "subquery", "per row", "n+1", "each user",
                    "repeated scan", "join", "group by",
                ],
            },
            {
                "type": "correlated_subquery_limit",
                "line": 11,
                "keywords": [
                    "correlated", "subquery", "limit 1", "order by", "lateral",
                    "row_number", "rank", "window function", "per row",
                ],
            },
            {
                "type": "missing_aggregation_join",
                "line": 16,
                "keywords": [
                    "left join", "group by", "aggreg", "single pass",
                    "coalesce", "join aggregat",
                ],
            },
        ],
        "approved_expected": False,
    },

    # ─────────────────────────────────────────────────────────────────
    # TASK 3 - MEDIUM-HARD: Wildcard LIKE Full Scan
    # ─────────────────────────────────────────────────────────────────
    "task_3_wildcard_scan": {
        "task_id":   "task_3_wildcard_scan",
        "task_name": "Wildcard LIKE & Projection Optimization",
        "task_description": (
            "The query scans all 1,000,000 events rows with leading and trailing wildcard "
            "LIKE patterns β€” these disable min/max pruning and force full column evaluation. "
            "It also computes derived columns for every row before filtering. "
            "Identify: leading-wildcard LIKE patterns that kill pruning, "
            "SELECT * on a million-row table, redundant OR conditions, "
            "and unnecessary computed columns evaluated before WHERE filtering. "
            "Rewrite to use exact equality and minimal column projection."
        ),
        "difficulty": "medium-hard",
        "dialect":   "duckdb/postgresql",
        "max_steps": 4,
        "schema_info": (
            "Table: events (1,000,000 rows)\n"
            "  id INT, user_id INT, session_id VARCHAR,\n"
            "  event_type VARCHAR, occurred_at DATE\n\n"
            "Distinct event_type values: purchase, view, click, signup, logout, search\n"
            "Wildcard LIKE on all 1M rows: forces full column scan\n"
            "Exact equality match: enables columnar zone-map pruning"
        ),
        "sql_query": (
            "SELECT\n"
            "    *,\n"
            "    CAST(id AS VARCHAR) || '_' || event_type  AS event_key,\n"
            "    upper(event_type)                          AS event_type_upper\n"
            "FROM events\n"
            "WHERE event_type LIKE '%purchase%'\n"
            "   OR event_type LIKE '%buy%'\n"
            "   OR session_id LIKE 'sess_%';"
        ),
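        # Illustrative rewrite (a sketch; it assumes the documented event_type
        # domain above, under which '%purchase%' reduces to equality and
        # '%buy%' matches nothing):
        #
        #   WITH hits AS (
        #       SELECT id, user_id, session_id, event_type, occurred_at
        #       FROM events
        #       WHERE event_type = 'purchase'     -- exact match, prunable
        #          OR session_id LIKE 'sess_%'    -- prefix LIKE stays sargable
        #   )
        #   SELECT id, user_id, session_id, event_type, occurred_at,
        #          CAST(id AS VARCHAR) || '_' || event_type AS event_key,
        #          upper(event_type)                        AS event_type_upper
        #   FROM hits;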
        "ground_truth_issues": [
            {
                "type": "leading_wildcard_like",
                "line": 6,
                "keywords": [
                    "leading wildcard", "like '%", "full scan", "exact match",
                    "equality", "pruning disabled", "wildcard", "zone map",
                ],
            },
            {
                "type": "or_expands_to_full_scan",
                "line": 7,
                "keywords": [
                    "or condition", "union", "separate queries", "or expands",
                    "full scan", "like '%buy%'", "redundant",
                ],
            },
            {
                "type": "select_star_large_table",
                "line": 2,
                "keywords": [
                    "select *", "1 million", "all columns", "projection",
                    "column pruning", "unnecessary", "bandwidth",
                ],
            },
            {
                "type": "pre_filter_computed_columns",
                "line": 3,
                "keywords": [
                    "computed column", "derived", "upper(", "cast(", "concatenat",
                    "before filter", "pre-filter", "push down", "CTE",
                ],
            },
        ],
        "approved_expected": False,
    },

    # ─────────────────────────────────────────────────────────────────
    # TASK 4 - HARD: Implicit Cross Join + Repeated Scalar Subqueries
    # ─────────────────────────────────────────────────────────────────
    "task_4_implicit_join": {
        "task_id":   "task_4_implicit_join",
        "task_name": "Implicit Cross Join & Scalar Subquery Elimination",
        "task_description": (
            "This query uses comma-separated FROM (implicit cross join syntax) and "
            "two correlated scalar subqueries that re-aggregate the entire orders table "
            "once per GROUP BY group. "
            "Identify: implicit cross join risk (comma in FROM clause), "
            "two correlated scalar subqueries recalculating global stats, "
            "and the GROUP BY without an explicit JOIN. "
            "Rewrite using explicit INNER JOIN and a CTE/subquery for the global stats "
            "so they are computed exactly once."
        ),
        "difficulty": "hard",
        "dialect":   "duckdb/postgresql",
        "max_steps": 5,
        "schema_info": (
            "Table: users (10,000 rows)  β€” id, email, tier, region, plan, created_at\n"
            "Table: orders (500,000 rows) β€” id, customer_id, product_id, status, total, created_at\n\n"
            "Join: users.id = orders.customer_id\n"
            "Implicit join (comma syntax) risk: if WHERE predicate is missing,\n"
            "produces a Cartesian product of 10k Γ— 500k = 5 BILLION rows.\n"
            "Scalar subqueries: each recalculates over all 500k orders per group."
        ),
        "sql_query": (
            "SELECT\n"
            "    u.region,\n"
            "    u.plan,\n"
            "    COUNT(*)      AS total_orders,\n"
            "    SUM(o.total)  AS revenue,\n"
            "    (SELECT AVG(total) FROM orders)                                    AS global_avg,\n"
            "    (SELECT MAX(total) FROM orders WHERE status = 'completed')         AS max_deal\n"
            "FROM users u, orders o\n"
            "WHERE u.id = o.customer_id\n"
            "  AND o.status IN ('completed', 'shipped')\n"
            "GROUP BY u.region, u.plan;"
        ),
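        # Illustrative rewrite (a sketch, not the reference answer): the global
        # stats move into a CTE computed exactly once, and the comma join
        # becomes an explicit INNER JOIN.
        #
        #   WITH global_stats AS (
        #       SELECT AVG(total)                                     AS global_avg,
        #              MAX(total) FILTER (WHERE status = 'completed') AS max_deal
        #       FROM orders
        #   )
        #   SELECT u.region, u.plan,
        #          COUNT(*)     AS total_orders,
        #          SUM(o.total) AS revenue,
        #          g.global_avg,
        #          g.max_deal
        #   FROM users u
        #   INNER JOIN orders o ON o.customer_id = u.id
        #   CROSS JOIN global_stats g
        #   WHERE o.status IN ('completed', 'shipped')
        #   GROUP BY u.region, u.plan, g.global_avg, g.max_deal;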
        "ground_truth_issues": [
            {
                "type": "implicit_cross_join",
                "line": 8,
                "keywords": [
                    "implicit", "cross join", "comma join", "explicit join",
                    "inner join", "cartesian", "comma in from",
                ],
            },
            {
                "type": "repeated_scalar_subquery_avg",
                "line": 6,
                "keywords": [
                    "scalar subquery", "correlated", "per group", "once per row",
                    "cte", "with clause", "pre-compute", "global avg",
                ],
            },
            {
                "type": "repeated_scalar_subquery_max",
                "line": 7,
                "keywords": [
                    "scalar subquery", "correlated", "per group", "max deal",
                    "cte", "pre-compute", "compute once", "constant",
                ],
            },
            {
                "type": "missing_explicit_join",
                "line": 8,
                "keywords": [
                    "inner join", "explicit", "on clause", "join condition",
                    "readable", "maintainable", "ansi sql",
                ],
            },
        ],
        "approved_expected": False,
    },

    # ─────────────────────────────────────────────────────────────────
    # TASK 5 - EXPERT: Window Function Over Entire 1M-Row Table
    # ─────────────────────────────────────────────────────────────────
    "task_5_window_functions": {
        "task_id":   "task_5_window_functions",
        "task_name": "Window Function & Full-Scan Audit",
        "task_description": (
            "Five window functions are computed over ALL 1,000,000 events rows with no "
            "pre-filtering. Each OVER() clause requires a full sort or hash-aggregate pass. "
            "The global RANK() OVER (ORDER BY occurred_at) requires sorting the entire "
            "table β€” the most expensive operation here. "
            "Identify: no WHERE clause causing full 1M-row scans, "
            "redundant window functions that can be merged, "
            "a global ordering window function with no PARTITION, "
            "and SELECT * on the full events table. "
            "Rewrite to filter first, merge windows, and remove the global RANK."
        ),
        "difficulty": "expert",
        "dialect":   "duckdb/postgresql",
        "max_steps": 5,
        "schema_info": (
            "Table: events (1,000,000 rows)\n"
            "  id INT, user_id INT, session_id VARCHAR,\n"
            "  event_type VARCHAR, occurred_at DATE\n\n"
            "Window function cost: each OVER() = full sort/hash pass over 1M rows\n"
            "5 window functions = 5 full passes before any filtering\n"
            "Global RANK(): sorts all 1M rows globally β€” most expensive operation\n"
            "Filtering to 'purchase' events first reduces dataset to ~167k rows (1/6)"
        ),
        "sql_query": (
            "SELECT\n"
            "    user_id,\n"
            "    event_type,\n"
            "    occurred_at,\n"
            "    COUNT(*) OVER (PARTITION BY user_id)                                AS total_user_events,\n"
            "    COUNT(*) OVER (PARTITION BY user_id, event_type)                   AS type_count,\n"
            "    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY occurred_at DESC) AS recency_rank,\n"
            "    RANK() OVER (ORDER BY occurred_at DESC)                            AS global_rank,\n"
            "    SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END)\n"
            "        OVER (PARTITION BY user_id)                                    AS user_purchases\n"
            "FROM events;"
        ),
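        # One possible shape (illustrative only; it assumes, as the description
        # suggests, that the consumer only needs purchase rows, after which
        # user_purchases and type_count collapse into one windowed count):
        #
        #   SELECT user_id, event_type, occurred_at,
        #          COUNT(*)     OVER (PARTITION BY user_id)           AS user_purchases,
        #          ROW_NUMBER() OVER (PARTITION BY user_id
        #                             ORDER BY occurred_at DESC)      AS recency_rank
        #   FROM events
        #   WHERE event_type = 'purchase';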
        "ground_truth_issues": [
            {
                "type": "no_pre_filter",
                "line": 11,
                "keywords": [
                    "no where", "no filter", "full table", "1 million", "all rows",
                    "pre-filter", "filter first", "cte", "with clause",
                ],
            },
            {
                "type": "global_rank_no_partition",
                "line": 8,
                "keywords": [
                    "rank() over", "global rank", "no partition", "entire table",
                    "full sort", "expensive", "global ordering", "remove",
                ],
            },
            {
                "type": "redundant_window_functions",
                "line": 5,
                "keywords": [
                    "5 window", "multiple over", "redundant", "merge", "combine",
                    "single pass", "same partition", "consolidate",
                ],
            },
            {
                "type": "count_vs_conditional_sum",
                "line": 9,
                "keywords": [
                    "case when", "sum case", "count filter", "filter clause",
                    "count(*) filter", "simpler", "merge with",
                ],
            },
            {
                "type": "select_all_unfiltered",
                "line": 1,
                "keywords": [
                    "select *", "user_id, event_type", "projection", "column pruning",
                    "select specific", "1 million rows", "bandwidth",
                ],
            },
        ],
        "approved_expected": False,
    },
}


def get_task_list() -> List[Dict[str, Any]]:
    """Return a lightweight, serializable summary of every registered task."""
    return [
        {
            "task_id":    t["task_id"],
            "task_name":  t["task_name"],
            "difficulty": t["difficulty"],
            "max_steps":  t["max_steps"],
            "description": t["task_description"],
            "action_schema": {
                "suggestions":          "List of {issue_type, line, description, severity, fix}",
                "optimized_query":      "str β€” complete rewritten SQL (will be EXECUTED for real timing)",
                "summary":              "str β€” overall performance analysis",
                "estimated_improvement": "str β€” expected speedup (e.g. '10x faster')",
                "approved":             "bool β€” True if already optimal",
            },
        }
        for t in TASKS.values()
    ]