ritvik360's picture
Upload folder using huggingface_hub
a39d8ef verified
"""
nl2sql-bench/server/tasks/hard.py
===================================
Task 3 — Analytics & Window (difficulty: hard)
Questions require CTEs, window functions (RANK, ROW_NUMBER, running totals),
or non-trivial subqueries. Even strong frontier models often need 3–5 steps.
"""
from __future__ import annotations
from .base import BaseTask, TaskExample, register
@register
class AnalyticsWindowTask(BaseTask):
name = "analytics-window"
difficulty = "hard"
examples = [
TaskExample(
question=(
"Rank customers by their total spending on delivered orders "
"using DENSE_RANK (rank 1 = highest spender). "
"Return columns: customer_name, total_spent, spending_rank. "
"Round total_spent to 2 decimal places. "
"Sort by spending_rank ascending."
),
sql=(
"SELECT customer_name, total_spent, spending_rank "
"FROM ( "
" SELECT c.name AS customer_name, "
" ROUND(SUM(o.total_amount), 2) AS total_spent, "
" DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank "
" FROM customers c "
" JOIN orders o ON o.customer_id = c.id "
" WHERE o.status = 'delivered' "
" GROUP BY c.id, c.name "
") sub "
"ORDER BY spending_rank ASC"
),
notes="Window function DENSE_RANK inside a subquery wrapping a GROUP BY.",
),
TaskExample(
question=(
"For each product that has been reviewed, show its name, its own "
"average rating, and the average rating of all products in its category. "
"Return columns: product_name, product_avg_rating, category_avg_rating. "
"Round both averages to 2 decimal places. "
"Sort by product_avg_rating descending."
),
sql=(
"SELECT p.name AS product_name, "
" ROUND(AVG(r.rating), 2) AS product_avg_rating, "
" ROUND(AVG(AVG(r.rating)) OVER (PARTITION BY p.category_id), 2) "
" AS category_avg_rating "
"FROM products p "
"JOIN reviews r ON r.product_id = p.id "
"GROUP BY p.id, p.name, p.category_id "
"ORDER BY product_avg_rating DESC"
),
notes="AVG of AVG via window PARTITION BY — requires nested aggregate understanding.",
),
TaskExample(
question=(
"Find all customers whose most recent order has status 'cancelled'. "
"Use a CTE with ROW_NUMBER to identify the latest order per customer. "
"Return columns: customer_name, last_order_status, last_order_date. "
"Sort by customer_name ascending."
),
sql=(
"WITH ranked_orders AS ( "
" SELECT customer_id, status, created_at, "
" ROW_NUMBER() OVER (PARTITION BY customer_id "
" ORDER BY created_at DESC) AS rn "
" FROM orders "
") "
"SELECT c.name AS customer_name, "
" ro.status AS last_order_status, "
" ro.created_at AS last_order_date "
"FROM customers c "
"JOIN ranked_orders ro ON ro.customer_id = c.id "
"WHERE ro.rn = 1 "
" AND ro.status = 'cancelled' "
"ORDER BY customer_name ASC"
),
notes="CTE + ROW_NUMBER window partitioned by customer_id.",
),
TaskExample(
question=(
"Show the monthly revenue from delivered orders and its running total, "
"for all months in 2024. "
"Return columns: month (format YYYY-MM), monthly_revenue, running_total. "
"Round both revenue columns to 2 decimal places. "
"Sort by month ascending."
),
sql=(
"WITH monthly AS ( "
" SELECT strftime('%Y-%m', created_at) AS month, "
" ROUND(SUM(total_amount), 2) AS monthly_revenue "
" FROM orders "
" WHERE status = 'delivered' "
" AND created_at >= '2024-01-01' "
" AND created_at < '2025-01-01' "
" GROUP BY strftime('%Y-%m', created_at) "
") "
"SELECT month, "
" monthly_revenue, "
" ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total "
"FROM monthly "
"ORDER BY month ASC"
),
notes="CTE + cumulative SUM window ordered by month string.",
),
TaskExample(
question=(
"Find products whose average rating is strictly above the average "
"rating of all products in their category. "
"Return columns: product_name, category_name, "
"product_avg_rating, category_avg_rating. "
"Round both averages to 2 decimal places. "
"Sort by product_avg_rating descending, then product_name ascending."
),
sql=(
"WITH product_ratings AS ( "
" SELECT p.id AS product_id, p.name AS product_name, "
" p.category_id, c.name AS category_name, "
" ROUND(AVG(r.rating), 2) AS product_avg_rating "
" FROM products p "
" JOIN reviews r ON r.product_id = p.id "
" JOIN categories c ON c.id = p.category_id "
" GROUP BY p.id, p.name, p.category_id, c.name "
"), "
"category_ratings AS ( "
" SELECT category_id, "
" ROUND(AVG(product_avg_rating), 2) AS category_avg_rating "
" FROM product_ratings "
" GROUP BY category_id "
") "
"SELECT pr.product_name, pr.category_name, "
" pr.product_avg_rating, cr.category_avg_rating "
"FROM product_ratings pr "
"JOIN category_ratings cr ON cr.category_id = pr.category_id "
"WHERE pr.product_avg_rating > cr.category_avg_rating "
"ORDER BY pr.product_avg_rating DESC, pr.product_name ASC"
),
notes="Two CTEs, correlated comparison between product and category averages.",
),
]
def description(self) -> str:
return (
"Advanced analytics queries using CTEs, window functions "
"(DENSE_RANK, ROW_NUMBER, running SUM), and nested subqueries. "
"Tests multi-step reasoning and SQLite-specific syntax."
)