File size: 7,110 Bytes
a39d8ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
nl2sql-bench/server/tasks/hard.py
===================================
Task 3 — Analytics & Window  (difficulty: hard)

Questions require CTEs, window functions (RANK, ROW_NUMBER, running totals),
or non-trivial subqueries. Even strong frontier models often need 3–5 steps.
"""

from __future__ import annotations

from .base import BaseTask, TaskExample, register


@register
class AnalyticsWindowTask(BaseTask):
    name = "analytics-window"
    difficulty = "hard"

    examples = [
        TaskExample(
            question=(
                "Rank customers by their total spending on delivered orders "
                "using DENSE_RANK (rank 1 = highest spender). "
                "Return columns: customer_name, total_spent, spending_rank. "
                "Round total_spent to 2 decimal places. "
                "Sort by spending_rank ascending."
            ),
            sql=(
                "SELECT customer_name, total_spent, spending_rank "
                "FROM ( "
                "  SELECT c.name AS customer_name, "
                "         ROUND(SUM(o.total_amount), 2) AS total_spent, "
                "         DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank "
                "  FROM customers c "
                "  JOIN orders o ON o.customer_id = c.id "
                "  WHERE o.status = 'delivered' "
                "  GROUP BY c.id, c.name "
                ") sub "
                "ORDER BY spending_rank ASC"
            ),
            notes="Window function DENSE_RANK inside a subquery wrapping a GROUP BY.",
        ),
        TaskExample(
            question=(
                "For each product that has been reviewed, show its name, its own "
                "average rating, and the average rating of all products in its category. "
                "Return columns: product_name, product_avg_rating, category_avg_rating. "
                "Round both averages to 2 decimal places. "
                "Sort by product_avg_rating descending."
            ),
            sql=(
                "SELECT p.name AS product_name, "
                "       ROUND(AVG(r.rating), 2) AS product_avg_rating, "
                "       ROUND(AVG(AVG(r.rating)) OVER (PARTITION BY p.category_id), 2) "
                "           AS category_avg_rating "
                "FROM products p "
                "JOIN reviews r ON r.product_id = p.id "
                "GROUP BY p.id, p.name, p.category_id "
                "ORDER BY product_avg_rating DESC"
            ),
            notes="AVG of AVG via window PARTITION BY — requires nested aggregate understanding.",
        ),
        TaskExample(
            question=(
                "Find all customers whose most recent order has status 'cancelled'. "
                "Use a CTE with ROW_NUMBER to identify the latest order per customer. "
                "Return columns: customer_name, last_order_status, last_order_date. "
                "Sort by customer_name ascending."
            ),
            sql=(
                "WITH ranked_orders AS ( "
                "  SELECT customer_id, status, created_at, "
                "         ROW_NUMBER() OVER (PARTITION BY customer_id "
                "                           ORDER BY created_at DESC) AS rn "
                "  FROM orders "
                ") "
                "SELECT c.name AS customer_name, "
                "       ro.status AS last_order_status, "
                "       ro.created_at AS last_order_date "
                "FROM customers c "
                "JOIN ranked_orders ro ON ro.customer_id = c.id "
                "WHERE ro.rn = 1 "
                "  AND ro.status = 'cancelled' "
                "ORDER BY customer_name ASC"
            ),
            notes="CTE + ROW_NUMBER window partitioned by customer_id.",
        ),
        TaskExample(
            question=(
                "Show the monthly revenue from delivered orders and its running total, "
                "for all months in 2024. "
                "Return columns: month (format YYYY-MM), monthly_revenue, running_total. "
                "Round both revenue columns to 2 decimal places. "
                "Sort by month ascending."
            ),
            sql=(
                "WITH monthly AS ( "
                "  SELECT strftime('%Y-%m', created_at) AS month, "
                "         ROUND(SUM(total_amount), 2) AS monthly_revenue "
                "  FROM orders "
                "  WHERE status = 'delivered' "
                "    AND created_at >= '2024-01-01' "
                "    AND created_at < '2025-01-01' "
                "  GROUP BY strftime('%Y-%m', created_at) "
                ") "
                "SELECT month, "
                "       monthly_revenue, "
                "       ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total "
                "FROM monthly "
                "ORDER BY month ASC"
            ),
            notes="CTE + cumulative SUM window ordered by month string.",
        ),
        TaskExample(
            question=(
                "Find products whose average rating is strictly above the average "
                "rating of all products in their category. "
                "Return columns: product_name, category_name, "
                "product_avg_rating, category_avg_rating. "
                "Round both averages to 2 decimal places. "
                "Sort by product_avg_rating descending, then product_name ascending."
            ),
            sql=(
                "WITH product_ratings AS ( "
                "  SELECT p.id AS product_id, p.name AS product_name, "
                "         p.category_id, c.name AS category_name, "
                "         ROUND(AVG(r.rating), 2) AS product_avg_rating "
                "  FROM products p "
                "  JOIN reviews r ON r.product_id = p.id "
                "  JOIN categories c ON c.id = p.category_id "
                "  GROUP BY p.id, p.name, p.category_id, c.name "
                "), "
                "category_ratings AS ( "
                "  SELECT category_id, "
                "         ROUND(AVG(product_avg_rating), 2) AS category_avg_rating "
                "  FROM product_ratings "
                "  GROUP BY category_id "
                ") "
                "SELECT pr.product_name, pr.category_name, "
                "       pr.product_avg_rating, cr.category_avg_rating "
                "FROM product_ratings pr "
                "JOIN category_ratings cr ON cr.category_id = pr.category_id "
                "WHERE pr.product_avg_rating > cr.category_avg_rating "
                "ORDER BY pr.product_avg_rating DESC, pr.product_name ASC"
            ),
            notes="Two CTEs, correlated comparison between product and category averages.",
        ),
    ]

    def description(self) -> str:
        return (
            "Advanced analytics queries using CTEs, window functions "
            "(DENSE_RANK, ROW_NUMBER, running SUM), and nested subqueries. "
            "Tests multi-step reasoning and SQLite-specific syntax."
        )