Spaces:
Running
Running
| """ | |
| SQLArenaEnv Task Library. | |
| 50 hand-crafted SQL tasks across 4 difficulty tiers. | |
| Each task includes: | |
| - A SQLite schema (CREATE TABLE statements) | |
| - Seed data (INSERT statements) | |
| - A natural language question | |
| - A validator function that checks agent's result against correct answer | |
| Difficulty tiers: | |
| EASY (10 tasks) β single table, basic SELECT/WHERE/ORDER BY | |
| MEDIUM (15 tasks) β JOINs, GROUP BY, HAVING, subqueries | |
| HARD (15 tasks) β window functions, CTEs, multi-join, aggregation | |
| EXPERT (10 tasks) β correlated subqueries, recursive CTEs, complex analytics | |
| """ | |
| from typing import Any, Callable, Dict, List, Optional | |
| from dataclasses import dataclass, field | |
| class SQLTask: | |
| task_id: str | |
| difficulty: str # easy / medium / hard / expert | |
| question: str | |
| schema_sql: str # CREATE TABLE statements | |
| seed_sql: str # INSERT statements | |
| solution_sql: str # Reference solution (for generating expected answer) | |
| schema_description: str # Human-readable schema hint for agent | |
| validator: Optional[Callable] = None # custom validator, None = use default row comparison | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # EASY TASKS β single table operations | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| EASY_TASKS = [ | |
| SQLTask( | |
| task_id="easy_001", | |
| difficulty="easy", | |
| question="List all employees whose salary is greater than 70000, ordered by salary descending.", | |
| schema_sql=""" | |
| CREATE TABLE employees ( | |
| id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| department TEXT, | |
| salary REAL, | |
| hire_date TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO employees VALUES | |
| (1,'Alice','Engineering',95000,'2020-01-15'), | |
| (2,'Bob','Marketing',55000,'2019-03-20'), | |
| (3,'Charlie','Engineering',82000,'2021-06-01'), | |
| (4,'Diana','HR',47000,'2018-11-10'), | |
| (5,'Eve','Engineering',110000,'2017-08-25'), | |
| (6,'Frank','Marketing',72000,'2022-02-14'), | |
| (7,'Grace','HR',68000,'2020-09-30'), | |
| (8,'Hank','Engineering',78000,'2019-12-05');""", | |
| solution_sql="SELECT id, name, department, salary FROM employees WHERE salary > 70000 ORDER BY salary DESC;", | |
| schema_description="employees(id, name, department, salary, hire_date)", | |
| ), | |
| SQLTask( | |
| task_id="easy_002", | |
| difficulty="easy", | |
| question="How many products are there in each category? Show category and count, ordered by count descending.", | |
| schema_sql=""" | |
| CREATE TABLE products ( | |
| product_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| category TEXT, | |
| price REAL, | |
| stock INTEGER | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO products VALUES | |
| (1,'Laptop','Electronics',999.99,50), | |
| (2,'Phone','Electronics',699.99,120), | |
| (3,'Desk','Furniture',249.99,30), | |
| (4,'Chair','Furniture',189.99,75), | |
| (5,'Tablet','Electronics',449.99,60), | |
| (6,'Bookshelf','Furniture',129.99,40), | |
| (7,'Headphones','Electronics',149.99,200), | |
| (8,'Lamp','Furniture',59.99,90), | |
| (9,'Monitor','Electronics',399.99,45), | |
| (10,'Keyboard','Electronics',79.99,150);""", | |
| solution_sql="SELECT category, COUNT(*) as count FROM products GROUP BY category ORDER BY count DESC;", | |
| schema_description="products(product_id, name, category, price, stock)", | |
| ), | |
| SQLTask( | |
| task_id="easy_003", | |
| difficulty="easy", | |
| question="Find the top 5 most expensive products. Show name and price.", | |
| schema_sql=""" | |
| CREATE TABLE products ( | |
| product_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| category TEXT, | |
| price REAL, | |
| stock INTEGER | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO products VALUES | |
| (1,'Laptop','Electronics',999.99,50), | |
| (2,'Phone','Electronics',699.99,120), | |
| (3,'Desk','Furniture',249.99,30), | |
| (4,'Chair','Furniture',189.99,75), | |
| (5,'Tablet','Electronics',449.99,60), | |
| (6,'Bookshelf','Furniture',129.99,40), | |
| (7,'Headphones','Electronics',149.99,200), | |
| (8,'Lamp','Furniture',59.99,90), | |
| (9,'Monitor','Electronics',399.99,45), | |
| (10,'Keyboard','Electronics',79.99,150);""", | |
| solution_sql="SELECT name, price FROM products ORDER BY price DESC LIMIT 5;", | |
| schema_description="products(product_id, name, category, price, stock)", | |
| ), | |
| SQLTask( | |
| task_id="easy_004", | |
| difficulty="easy", | |
| question="Find all students who scored above 85 in Mathematics. Show name and score.", | |
| schema_sql=""" | |
| CREATE TABLE students ( | |
| student_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| subject TEXT, | |
| score INTEGER, | |
| grade TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO students VALUES | |
| (1,'Arjun','Mathematics',92,'A'), | |
| (2,'Priya','Mathematics',78,'B'), | |
| (3,'Rahul','Science',88,'A'), | |
| (4,'Sneha','Mathematics',95,'A'), | |
| (5,'Vikram','Science',72,'C'), | |
| (6,'Ananya','Mathematics',65,'D'), | |
| (7,'Karan','Mathematics',89,'A'), | |
| (8,'Divya','Science',91,'A'), | |
| (9,'Rohit','Mathematics',55,'F'), | |
| (10,'Meera','Mathematics',87,'A');""", | |
| solution_sql="SELECT name, score FROM students WHERE subject='Mathematics' AND score > 85 ORDER BY score DESC;", | |
| schema_description="students(student_id, name, subject, score, grade)", | |
| ), | |
| SQLTask( | |
| task_id="easy_005", | |
| difficulty="easy", | |
| question="What is the total revenue (price * quantity) from all orders? Return a single number called total_revenue.", | |
| schema_sql=""" | |
| CREATE TABLE orders ( | |
| order_id INTEGER PRIMARY KEY, | |
| customer TEXT, | |
| product TEXT, | |
| price REAL, | |
| quantity INTEGER, | |
| order_date TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO orders VALUES | |
| (1,'Alice','Laptop',999.99,2,'2024-01-10'), | |
| (2,'Bob','Phone',699.99,1,'2024-01-11'), | |
| (3,'Charlie','Tablet',449.99,3,'2024-01-12'), | |
| (4,'Diana','Headphones',149.99,5,'2024-01-13'), | |
| (5,'Eve','Monitor',399.99,1,'2024-01-14'), | |
| (6,'Frank','Keyboard',79.99,4,'2024-01-15'), | |
| (7,'Grace','Laptop',999.99,1,'2024-01-16'), | |
| (8,'Hank','Phone',699.99,2,'2024-01-17');""", | |
| solution_sql="SELECT ROUND(SUM(price * quantity), 2) as total_revenue FROM orders;", | |
| schema_description="orders(order_id, customer, product, price, quantity, order_date)", | |
| ), | |
| SQLTask( | |
| task_id="easy_006", | |
| difficulty="easy", | |
| question="List all cities that have more than 1 million population, ordered alphabetically.", | |
| schema_sql=""" | |
| CREATE TABLE cities ( | |
| city_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| country TEXT, | |
| population INTEGER, | |
| area_km2 REAL | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO cities VALUES | |
| (1,'Mumbai','India',20667656,603.4), | |
| (2,'Delhi','India',32941000,1484.0), | |
| (3,'Kolkata','India',14850000,206.1), | |
| (4,'Chennai','India',10971000,426.0), | |
| (5,'Pune','India',3124000,331.3), | |
| (6,'Hyderabad','India',10534000,650.0), | |
| (7,'Ahmedabad','India',8450000,475.0), | |
| (8,'Jaipur','India',3766000,467.0), | |
| (9,'Surat','India',6936000,326.5), | |
| (10,'Visakhapatnam','India',2035000,681.96);""", | |
| solution_sql="SELECT name, population FROM cities WHERE population > 1000000 ORDER BY name ASC;", | |
| schema_description="cities(city_id, name, country, population, area_km2)", | |
| ), | |
| SQLTask( | |
| task_id="easy_007", | |
| difficulty="easy", | |
| question="Find the average salary by department. Show department and avg_salary rounded to 2 decimals.", | |
| schema_sql=""" | |
| CREATE TABLE employees ( | |
| id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| department TEXT, | |
| salary REAL, | |
| hire_date TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO employees VALUES | |
| (1,'Alice','Engineering',95000,'2020-01-15'), | |
| (2,'Bob','Marketing',55000,'2019-03-20'), | |
| (3,'Charlie','Engineering',82000,'2021-06-01'), | |
| (4,'Diana','HR',47000,'2018-11-10'), | |
| (5,'Eve','Engineering',110000,'2017-08-25'), | |
| (6,'Frank','Marketing',72000,'2022-02-14'), | |
| (7,'Grace','HR',68000,'2020-09-30'), | |
| (8,'Hank','Engineering',78000,'2019-12-05'), | |
| (9,'Ivy','Marketing',61000,'2021-03-15'), | |
| (10,'Jack','HR',52000,'2022-07-01');""", | |
| solution_sql="SELECT department, ROUND(AVG(salary),2) as avg_salary FROM employees GROUP BY department ORDER BY avg_salary DESC;", | |
| schema_description="employees(id, name, department, salary, hire_date)", | |
| ), | |
| SQLTask( | |
| task_id="easy_008", | |
| difficulty="easy", | |
| question="Find all books published after 2010 with rating above 4.0. Show title, author, and rating.", | |
| schema_sql=""" | |
| CREATE TABLE books ( | |
| book_id INTEGER PRIMARY KEY, | |
| title TEXT, | |
| author TEXT, | |
| year INTEGER, | |
| rating REAL, | |
| genre TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO books VALUES | |
| (1,'The Pragmatic Programmer','Hunt & Thomas',1999,4.8,'Tech'), | |
| (2,'Clean Code','Robert Martin',2008,4.6,'Tech'), | |
| (3,'Thinking Fast and Slow','Daniel Kahneman',2011,4.5,'Psychology'), | |
| (4,'Atomic Habits','James Clear',2018,4.8,'Self-Help'), | |
| (5,'Deep Work','Cal Newport',2016,4.4,'Self-Help'), | |
| (6,'The Lean Startup','Eric Ries',2011,4.2,'Business'), | |
| (7,'Zero to One','Peter Thiel',2014,4.3,'Business'), | |
| (8,'Dune','Frank Herbert',1965,4.7,'Fiction'), | |
| (9,'Project Hail Mary','Andy Weir',2021,4.9,'Fiction'), | |
| (10,'The Midnight Library','Matt Haig',2020,4.2,'Fiction');""", | |
| solution_sql="SELECT title, author, rating FROM books WHERE year > 2010 AND rating > 4.0 ORDER BY rating DESC;", | |
| schema_description="books(book_id, title, author, year, rating, genre)", | |
| ), | |
| SQLTask( | |
| task_id="easy_009", | |
| difficulty="easy", | |
| question="Count how many employees were hired each year. Show hire_year and count, ordered by year.", | |
| schema_sql=""" | |
| CREATE TABLE employees ( | |
| id INTEGER PRIMARY KEY, | |
| name TEXT NOT NULL, | |
| department TEXT, | |
| salary REAL, | |
| hire_date TEXT | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO employees VALUES | |
| (1,'Alice','Engineering',95000,'2020-01-15'), | |
| (2,'Bob','Marketing',55000,'2019-03-20'), | |
| (3,'Charlie','Engineering',82000,'2021-06-01'), | |
| (4,'Diana','HR',47000,'2018-11-10'), | |
| (5,'Eve','Engineering',110000,'2017-08-25'), | |
| (6,'Frank','Marketing',72000,'2022-02-14'), | |
| (7,'Grace','HR',68000,'2020-09-30'), | |
| (8,'Hank','Engineering',78000,'2019-12-05'), | |
| (9,'Ivy','Marketing',61000,'2021-03-15'), | |
| (10,'Jack','HR',52000,'2022-07-01');""", | |
| solution_sql="SELECT SUBSTR(hire_date,1,4) as hire_year, COUNT(*) as count FROM employees GROUP BY hire_year ORDER BY hire_year;", | |
| schema_description="employees(id, name, department, salary, hire_date β format YYYY-MM-DD)", | |
| ), | |
| SQLTask( | |
| task_id="easy_010", | |
| difficulty="easy", | |
| question="Find the minimum and maximum price for each product category.", | |
| schema_sql=""" | |
| CREATE TABLE products ( | |
| product_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| category TEXT, | |
| price REAL, | |
| stock INTEGER | |
| );""", | |
| seed_sql=""" | |
| INSERT INTO products VALUES | |
| (1,'Laptop','Electronics',999.99,50), | |
| (2,'Phone','Electronics',699.99,120), | |
| (3,'Desk','Furniture',249.99,30), | |
| (4,'Chair','Furniture',189.99,75), | |
| (5,'Tablet','Electronics',449.99,60), | |
| (6,'Bookshelf','Furniture',129.99,40), | |
| (7,'Headphones','Electronics',149.99,200), | |
| (8,'Lamp','Furniture',59.99,90), | |
| (9,'Monitor','Electronics',399.99,45), | |
| (10,'Keyboard','Electronics',79.99,150);""", | |
| solution_sql="SELECT category, MIN(price) as min_price, MAX(price) as max_price FROM products GROUP BY category ORDER BY category;", | |
| schema_description="products(product_id, name, category, price, stock)", | |
| ), | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # MEDIUM TASKS β JOINs, GROUP BY, HAVING | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| MEDIUM_SCHEMA_ECOMMERCE = """ | |
| CREATE TABLE customers ( | |
| customer_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| email TEXT, | |
| city TEXT, | |
| joined_date TEXT | |
| ); | |
| CREATE TABLE orders ( | |
| order_id INTEGER PRIMARY KEY, | |
| customer_id INTEGER, | |
| order_date TEXT, | |
| status TEXT, | |
| FOREIGN KEY (customer_id) REFERENCES customers(customer_id) | |
| ); | |
| CREATE TABLE order_items ( | |
| item_id INTEGER PRIMARY KEY, | |
| order_id INTEGER, | |
| product_name TEXT, | |
| quantity INTEGER, | |
| unit_price REAL, | |
| FOREIGN KEY (order_id) REFERENCES orders(order_id) | |
| );""" | |
| MEDIUM_SEED_ECOMMERCE = """ | |
| INSERT INTO customers VALUES | |
| (1,'Arjun Sharma','arjun@email.com','Mumbai','2022-01-15'), | |
| (2,'Priya Patel','priya@email.com','Delhi','2022-03-20'), | |
| (3,'Rahul Gupta','rahul@email.com','Bangalore','2021-11-05'), | |
| (4,'Sneha Reddy','sneha@email.com','Hyderabad','2023-02-10'), | |
| (5,'Vikram Singh','vikram@email.com','Chennai','2021-08-25'), | |
| (6,'Ananya Joshi','ananya@email.com','Pune','2022-07-14'), | |
| (7,'Karan Mehta','karan@email.com','Mumbai','2023-01-30'), | |
| (8,'Divya Nair','divya@email.com','Bangalore','2022-05-18'); | |
| INSERT INTO orders VALUES | |
| (101,1,'2024-01-10','completed'), | |
| (102,1,'2024-02-15','completed'), | |
| (103,2,'2024-01-20','completed'), | |
| (104,3,'2024-01-25','pending'), | |
| (105,3,'2024-02-28','completed'), | |
| (106,4,'2024-03-05','completed'), | |
| (107,5,'2024-01-08','cancelled'), | |
| (108,5,'2024-03-12','completed'), | |
| (109,6,'2024-02-20','completed'), | |
| (110,7,'2024-03-01','pending'), | |
| (111,8,'2024-01-30','completed'), | |
| (112,8,'2024-02-25','completed'); | |
| INSERT INTO order_items VALUES | |
| (1,101,'Laptop',1,999.99), | |
| (2,101,'Mouse',2,29.99), | |
| (3,102,'Phone',1,699.99), | |
| (4,103,'Tablet',2,449.99), | |
| (5,104,'Headphones',3,149.99), | |
| (6,105,'Monitor',1,399.99), | |
| (7,106,'Keyboard',2,79.99), | |
| (8,107,'Laptop',1,999.99), | |
| (9,108,'Phone',1,699.99), | |
| (10,109,'Desk',1,249.99), | |
| (11,110,'Chair',2,189.99), | |
| (12,111,'Laptop',1,999.99), | |
| (13,111,'Tablet',1,449.99), | |
| (14,112,'Headphones',2,149.99);""" | |
| MEDIUM_SCHEMA_HR = """ | |
| CREATE TABLE departments ( | |
| dept_id INTEGER PRIMARY KEY, | |
| dept_name TEXT, | |
| manager_id INTEGER, | |
| budget REAL | |
| ); | |
| CREATE TABLE employees ( | |
| emp_id INTEGER PRIMARY KEY, | |
| name TEXT, | |
| dept_id INTEGER, | |
| salary REAL, | |
| hire_date TEXT, | |
| manager_id INTEGER, | |
| FOREIGN KEY (dept_id) REFERENCES departments(dept_id) | |
| );""" | |
| MEDIUM_SEED_HR = """ | |
| INSERT INTO departments VALUES | |
| (1,'Engineering',5,5000000), | |
| (2,'Marketing',8,2000000), | |
| (3,'HR',12,1500000), | |
| (4,'Finance',15,3000000), | |
| (5,'Operations',18,2500000); | |
| INSERT INTO employees VALUES | |
| (1,'Alice Chen',1,95000,'2020-01-15',5), | |
| (2,'Bob Kumar',1,82000,'2021-06-01',5), | |
| (3,'Charlie Roy',1,78000,'2019-12-05',5), | |
| (4,'Diana Shah',2,72000,'2022-02-14',8), | |
| (5,'Eve Patel',1,110000,'2017-08-25',NULL), | |
| (6,'Frank Joshi',2,55000,'2019-03-20',8), | |
| (7,'Grace Singh',3,68000,'2020-09-30',12), | |
| (8,'Hank Mehta',2,88000,'2018-06-15',NULL), | |
| (9,'Ivy Reddy',3,52000,'2022-07-01',12), | |
| (10,'Jack Nair',3,47000,'2018-11-10',12), | |
| (11,'Kara Gupta',4,91000,'2019-04-22',15), | |
| (12,'Leo Sharma',3,75000,'2016-03-10',NULL), | |
| (13,'Mia Verma',4,83000,'2020-08-14',15), | |
| (14,'Nia Bose',5,69000,'2021-01-20',18), | |
| (15,'Omar Das',4,102000,'2015-11-30',NULL), | |
| (16,'Pia Iyer',5,61000,'2022-03-25',18), | |
| (17,'Quinn Rao',1,74000,'2021-09-08',5), | |
| (18,'Rita Pillai',5,95000,'2016-07-12',NULL);""" | |
| MEDIUM_TASKS = [ | |
| SQLTask( | |
| task_id="medium_001", | |
| difficulty="medium", | |
| question="Find customers who have placed more than 1 completed order. Show customer name and order count.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT c.name, COUNT(o.order_id) as order_count | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| WHERE o.status = 'completed' | |
| GROUP BY c.customer_id, c.name | |
| HAVING COUNT(o.order_id) > 1 | |
| ORDER BY order_count DESC;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_002", | |
| difficulty="medium", | |
| question="Calculate total spending per customer (only completed orders). Show customer name and total_spent, ordered by total_spent descending.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT c.name, ROUND(SUM(oi.quantity * oi.unit_price), 2) as total_spent | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| JOIN order_items oi ON o.order_id = oi.order_id | |
| WHERE o.status = 'completed' | |
| GROUP BY c.customer_id, c.name | |
| ORDER BY total_spent DESC;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_003", | |
| difficulty="medium", | |
| question="Find the most popular product (by total quantity sold across all completed orders).", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT oi.product_name, SUM(oi.quantity) as total_sold | |
| FROM order_items oi | |
| JOIN orders o ON oi.order_id = o.order_id | |
| WHERE o.status = 'completed' | |
| GROUP BY oi.product_name | |
| ORDER BY total_sold DESC | |
| LIMIT 1;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_004", | |
| difficulty="medium", | |
| question="Find departments where average salary exceeds 80000. Show dept_name and avg_salary.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT d.dept_name, ROUND(AVG(e.salary), 2) as avg_salary | |
| FROM departments d | |
| JOIN employees e ON d.dept_id = e.dept_id | |
| GROUP BY d.dept_id, d.dept_name | |
| HAVING AVG(e.salary) > 80000 | |
| ORDER BY avg_salary DESC;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_005", | |
| difficulty="medium", | |
| question="For each department, find the highest-paid employee. Show dept_name, employee name, and salary.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT d.dept_name, e.name, e.salary | |
| FROM employees e | |
| JOIN departments d ON e.dept_id = d.dept_id | |
| WHERE e.salary = ( | |
| SELECT MAX(e2.salary) FROM employees e2 WHERE e2.dept_id = e.dept_id | |
| ) | |
| ORDER BY e.salary DESC;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_006", | |
| difficulty="medium", | |
| question="List customers from Mumbai or Bangalore who have at least one order. Show name and city.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT DISTINCT c.name, c.city | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| WHERE c.city IN ('Mumbai','Bangalore') | |
| ORDER BY c.city, c.name;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_007", | |
| difficulty="medium", | |
| question="Find employees who earn more than their department's average salary. Show name, salary, and dept_name.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT e.name, e.salary, d.dept_name | |
| FROM employees e | |
| JOIN departments d ON e.dept_id = d.dept_id | |
| WHERE e.salary > ( | |
| SELECT AVG(e2.salary) FROM employees e2 WHERE e2.dept_id = e.dept_id | |
| ) | |
| ORDER BY d.dept_name, e.salary DESC;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_008", | |
| difficulty="medium", | |
| question="Calculate the cancellation rate per customer (cancelled orders / total orders). Show only customers with cancellation rate > 0. Show name and cancellation_rate rounded to 2 decimals.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT c.name, | |
| ROUND(CAST(SUM(CASE WHEN o.status='cancelled' THEN 1 ELSE 0 END) AS REAL) / COUNT(o.order_id), 2) as cancellation_rate | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name | |
| HAVING SUM(CASE WHEN o.status='cancelled' THEN 1 ELSE 0 END) > 0 | |
| ORDER BY cancellation_rate DESC;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_009", | |
| difficulty="medium", | |
| question="Find total salary cost per department and what percentage of total company salary each dept represents. Show dept_name, total_salary, salary_pct rounded to 1 decimal.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT d.dept_name, | |
| SUM(e.salary) as total_salary, | |
| ROUND(SUM(e.salary) * 100.0 / (SELECT SUM(salary) FROM employees), 1) as salary_pct | |
| FROM employees e | |
| JOIN departments d ON e.dept_id = d.dept_id | |
| GROUP BY d.dept_id, d.dept_name | |
| ORDER BY total_salary DESC;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_010", | |
| difficulty="medium", | |
| question="Find orders placed in January 2024. Show customer name, order_date, and order total (sum of quantity*unit_price).", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT c.name, o.order_date, ROUND(SUM(oi.quantity * oi.unit_price),2) as order_total | |
| FROM orders o | |
| JOIN customers c ON o.customer_id = c.customer_id | |
| JOIN order_items oi ON o.order_id = oi.order_id | |
| WHERE o.order_date LIKE '2024-01%' | |
| GROUP BY o.order_id, c.name, o.order_date | |
| ORDER BY o.order_date;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_011", | |
| difficulty="medium", | |
| question="Which product has generated the most revenue across all completed orders? Show product_name and total_revenue.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT oi.product_name, ROUND(SUM(oi.quantity * oi.unit_price),2) as total_revenue | |
| FROM order_items oi | |
| JOIN orders o ON oi.order_id = o.order_id | |
| WHERE o.status = 'completed' | |
| GROUP BY oi.product_name | |
| ORDER BY total_revenue DESC | |
| LIMIT 1;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| SQLTask( | |
| task_id="medium_012", | |
| difficulty="medium", | |
| question="Find employees who have been at the company for more than 4 years (hired before 2021). Show name, hire_date, and dept_name.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT e.name, e.hire_date, d.dept_name | |
| FROM employees e | |
| JOIN departments d ON e.dept_id = d.dept_id | |
| WHERE e.hire_date < '2021-01-01' | |
| ORDER BY e.hire_date;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_013", | |
| difficulty="medium", | |
| question="Find the second highest salary in the company (single value, call it second_highest_salary).", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT MAX(salary) as second_highest_salary | |
| FROM employees | |
| WHERE salary < (SELECT MAX(salary) FROM employees);""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id)", | |
| ), | |
| SQLTask( | |
| task_id="medium_014", | |
| difficulty="medium", | |
| question="Count the number of employees per manager (only show managers with 2+ direct reports). Show manager name and report_count.", | |
| schema_sql=MEDIUM_SCHEMA_HR, | |
| seed_sql=MEDIUM_SEED_HR, | |
| solution_sql=""" | |
| SELECT m.name as manager_name, COUNT(e.emp_id) as report_count | |
| FROM employees e | |
| JOIN employees m ON e.manager_id = m.emp_id | |
| GROUP BY m.emp_id, m.name | |
| HAVING COUNT(e.emp_id) >= 2 | |
| ORDER BY report_count DESC;""", | |
| schema_description="departments(dept_id, dept_name, manager_id, budget), employees(emp_id, name, dept_id, salary, hire_date, manager_id) β manager_id refers to emp_id of the manager", | |
| ), | |
| SQLTask( | |
| task_id="medium_015", | |
| difficulty="medium", | |
| question="Find customers who have never placed an order. Show their name and email.", | |
| schema_sql=MEDIUM_SCHEMA_ECOMMERCE, | |
| seed_sql=MEDIUM_SEED_ECOMMERCE, | |
| solution_sql=""" | |
| SELECT c.name, c.email | |
| FROM customers c | |
| LEFT JOIN orders o ON c.customer_id = o.customer_id | |
| WHERE o.order_id IS NULL | |
| ORDER BY c.name;""", | |
| schema_description="customers(customer_id, name, email, city, joined_date), orders(order_id, customer_id, order_date, status), order_items(item_id, order_id, product_name, quantity, unit_price)", | |
| ), | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # HARD TASKS β CTEs, Window Functions | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| HARD_SCHEMA = """ | |
| CREATE TABLE customers ( | |
| customer_id INTEGER PRIMARY KEY, | |
| name TEXT, city TEXT, segment TEXT | |
| ); | |
| CREATE TABLE orders ( | |
| order_id INTEGER PRIMARY KEY, | |
| customer_id INTEGER, | |
| order_date TEXT, | |
| ship_date TEXT, | |
| category TEXT, | |
| amount REAL | |
| ); | |
| CREATE TABLE returns ( | |
| return_id INTEGER PRIMARY KEY, | |
| order_id INTEGER, | |
| return_date TEXT, | |
| reason TEXT | |
| );""" | |
| HARD_SEED = """ | |
| INSERT INTO customers VALUES | |
| (1,'Arjun','Mumbai','Consumer'), | |
| (2,'Priya','Delhi','Corporate'), | |
| (3,'Rahul','Bangalore','Consumer'), | |
| (4,'Sneha','Hyderabad','Corporate'), | |
| (5,'Vikram','Chennai','Home Office'), | |
| (6,'Ananya','Pune','Consumer'), | |
| (7,'Karan','Mumbai','Corporate'), | |
| (8,'Divya','Bangalore','Consumer'); | |
| INSERT INTO orders VALUES | |
| (1001,1,'2023-01-10','2023-01-15','Technology',2500.00), | |
| (1002,1,'2023-03-20','2023-03-25','Furniture',1200.00), | |
| (1003,1,'2023-06-15','2023-06-20','Technology',3800.00), | |
| (1004,2,'2023-02-05','2023-02-10','Office Supplies',450.00), | |
| (1005,2,'2023-04-18','2023-04-23','Technology',5200.00), | |
| (1006,2,'2023-07-30','2023-08-04','Furniture',890.00), | |
| (1007,3,'2023-01-25','2023-01-30','Technology',1100.00), | |
| (1008,3,'2023-05-12','2023-05-17','Office Supplies',320.00), | |
| (1009,4,'2023-03-08','2023-03-13','Furniture',2200.00), | |
| (1010,4,'2023-08-22','2023-08-27','Technology',4100.00), | |
| (1011,5,'2023-02-14','2023-02-19','Office Supplies',180.00), | |
| (1012,5,'2023-06-28','2023-07-03','Technology',2900.00), | |
| (1013,6,'2023-01-05','2023-01-10','Furniture',750.00), | |
| (1014,6,'2023-04-25','2023-04-30','Office Supplies',290.00), | |
| (1015,7,'2023-03-15','2023-03-20','Technology',6800.00), | |
| (1016,7,'2023-07-10','2023-07-15','Furniture',1500.00), | |
| (1017,8,'2023-02-28','2023-03-05','Technology',3200.00), | |
| (1018,8,'2023-05-20','2023-05-25','Office Supplies',410.00); | |
| INSERT INTO returns VALUES | |
| (1,1002,'2023-04-01','Defective'), | |
| (2,1004,'2023-02-20','Wrong item'), | |
| (3,1008,'2023-05-25','Not needed'), | |
| (4,1011,'2023-02-25','Defective'), | |
| (5,1013,'2023-01-20','Wrong item');""" | |
| HARD_TASKS = [ | |
| SQLTask( | |
| task_id="hard_001", | |
| difficulty="hard", | |
| question="Using a window function, rank customers by their total order amount within each city segment. Show customer name, city, total_amount, and their rank within city.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT c.name, c.city, | |
| SUM(o.amount) as total_amount, | |
| RANK() OVER (PARTITION BY c.city ORDER BY SUM(o.amount) DESC) as city_rank | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name, c.city | |
| ORDER BY c.city, city_rank;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_002", | |
| difficulty="hard", | |
| question="Using a CTE, find the running total of order amounts for each customer ordered by order_date. Show customer name, order_date, amount, and running_total.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH customer_orders AS ( | |
| SELECT c.name, o.order_date, o.amount, | |
| SUM(o.amount) OVER (PARTITION BY c.customer_id ORDER BY o.order_date) as running_total | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| ) | |
| SELECT name, order_date, amount, running_total | |
| FROM customer_orders | |
| ORDER BY name, order_date;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_003", | |
| difficulty="hard", | |
| question="Find the return rate by category (returned orders / total orders). Show category and return_rate as a percentage rounded to 1 decimal.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT o.category, | |
| ROUND(COUNT(r.return_id) * 100.0 / COUNT(o.order_id), 1) as return_rate | |
| FROM orders o | |
| LEFT JOIN returns r ON o.order_id = r.order_id | |
| GROUP BY o.category | |
| ORDER BY return_rate DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_004", | |
| difficulty="hard", | |
| question="For each customer, find the month with their highest single order amount. Show customer name, best_month (YYYY-MM format), and max_amount.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH monthly_max AS ( | |
| SELECT c.name, SUBSTR(o.order_date,1,7) as month, MAX(o.amount) as max_amount, | |
| RANK() OVER (PARTITION BY c.customer_id ORDER BY MAX(o.amount) DESC) as rnk | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name, month | |
| ) | |
| SELECT name, month as best_month, max_amount | |
| FROM monthly_max | |
| WHERE rnk = 1 | |
| ORDER BY max_amount DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_005", | |
| difficulty="hard", | |
| question="Calculate average shipping time (days between order_date and ship_date) per category. Show category and avg_ship_days rounded to 1 decimal.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT category, | |
| ROUND(AVG(julianday(ship_date) - julianday(order_date)), 1) as avg_ship_days | |
| FROM orders | |
| GROUP BY category | |
| ORDER BY avg_ship_days;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_006", | |
| difficulty="hard", | |
| question="Find customers whose total order value in Technology exceeds their total in all other categories combined. Show customer name, tech_total, and other_total.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH category_totals AS ( | |
| SELECT c.customer_id, c.name, | |
| SUM(CASE WHEN o.category='Technology' THEN o.amount ELSE 0 END) as tech_total, | |
| SUM(CASE WHEN o.category!='Technology' THEN o.amount ELSE 0 END) as other_total | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name | |
| ) | |
| SELECT name, tech_total, other_total | |
| FROM category_totals | |
| WHERE tech_total > other_total | |
| ORDER BY tech_total DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_007", | |
| difficulty="hard", | |
| question="Using LAG window function, for each customer show each order amount and the difference from their previous order amount. Show name, order_date, amount, prev_amount, and amount_change.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT c.name, o.order_date, o.amount, | |
| LAG(o.amount) OVER (PARTITION BY c.customer_id ORDER BY o.order_date) as prev_amount, | |
| o.amount - LAG(o.amount) OVER (PARTITION BY c.customer_id ORDER BY o.order_date) as amount_change | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| ORDER BY c.name, o.order_date;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_008", | |
| difficulty="hard", | |
| question="Identify the top 2 customers by total order amount in each customer segment. Show segment, customer name, total_amount, and rank within segment.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH ranked AS ( | |
| SELECT c.segment, c.name, | |
| SUM(o.amount) as total_amount, | |
| RANK() OVER (PARTITION BY c.segment ORDER BY SUM(o.amount) DESC) as rnk | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.segment, c.name | |
| ) | |
| SELECT segment, name, total_amount, rnk | |
| FROM ranked | |
| WHERE rnk <= 2 | |
| ORDER BY segment, rnk;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_009", | |
| difficulty="hard", | |
| question="Find the percentage of total revenue contributed by each customer. Show name and revenue_pct rounded to 2 decimals, ordered by pct descending.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT c.name, | |
| ROUND(SUM(o.amount) * 100.0 / (SELECT SUM(amount) FROM orders), 2) as revenue_pct | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name | |
| ORDER BY revenue_pct DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_010", | |
| difficulty="hard", | |
| question="For each category, show total revenue, average order size, and count of unique customers who ordered in that category.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT o.category, | |
| ROUND(SUM(o.amount), 2) as total_revenue, | |
| ROUND(AVG(o.amount), 2) as avg_order_size, | |
| COUNT(DISTINCT o.customer_id) as unique_customers | |
| FROM orders o | |
| GROUP BY o.category | |
| ORDER BY total_revenue DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_011", | |
| difficulty="hard", | |
| question="Find customers who have ordered in ALL three categories (Technology, Furniture, Office Supplies). Show customer name.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT c.name | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| WHERE o.category IN ('Technology','Furniture','Office Supplies') | |
| GROUP BY c.customer_id, c.name | |
| HAVING COUNT(DISTINCT o.category) = 3 | |
| ORDER BY c.name;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_012", | |
| difficulty="hard", | |
| question="For each order, show the order amount as a percentile rank (0 to 1) among all orders. Show order_id, amount, and percentile_rank rounded to 2 decimals.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT order_id, amount, | |
| ROUND(PERCENT_RANK() OVER (ORDER BY amount), 2) as percentile_rank | |
| FROM orders | |
| ORDER BY amount;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_013", | |
| difficulty="hard", | |
| question="Find orders where the shipping time was above the average shipping time for their category. Show order_id, category, ship_days, and category_avg_days (both rounded to 1 decimal).", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH ship_times AS ( | |
| SELECT order_id, category, | |
| ROUND(julianday(ship_date) - julianday(order_date), 1) as ship_days, | |
| ROUND(AVG(julianday(ship_date) - julianday(order_date)) OVER (PARTITION BY category), 1) as category_avg_days | |
| FROM orders | |
| ) | |
| SELECT order_id, category, ship_days, category_avg_days | |
| FROM ship_times | |
| WHERE ship_days > category_avg_days | |
| ORDER BY ship_days DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_014", | |
| difficulty="hard", | |
| question="Find the first and last order date for each customer, and number of days between them (customer tenure in days). Show name, first_order, last_order, tenure_days.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| SELECT c.name, | |
| MIN(o.order_date) as first_order, | |
| MAX(o.order_date) as last_order, | |
| CAST(julianday(MAX(o.order_date)) - julianday(MIN(o.order_date)) AS INTEGER) as tenure_days | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name | |
| ORDER BY tenure_days DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| SQLTask( | |
| task_id="hard_015", | |
| difficulty="hard", | |
| question="Using a CTE, find customers with above-average order count AND above-average total spend. Show name, order_count, and total_spend.", | |
| schema_sql=HARD_SCHEMA, | |
| seed_sql=HARD_SEED, | |
| solution_sql=""" | |
| WITH customer_stats AS ( | |
| SELECT c.customer_id, c.name, | |
| COUNT(o.order_id) as order_count, | |
| SUM(o.amount) as total_spend | |
| FROM customers c | |
| JOIN orders o ON c.customer_id = o.customer_id | |
| GROUP BY c.customer_id, c.name | |
| ), | |
| averages AS ( | |
| SELECT AVG(order_count) as avg_orders, AVG(total_spend) as avg_spend | |
| FROM customer_stats | |
| ) | |
| SELECT cs.name, cs.order_count, cs.total_spend | |
| FROM customer_stats cs, averages | |
| WHERE cs.order_count > averages.avg_orders | |
| AND cs.total_spend > averages.avg_spend | |
| ORDER BY cs.total_spend DESC;""", | |
| schema_description="customers(customer_id, name, city, segment), orders(order_id, customer_id, order_date, ship_date, category, amount), returns(return_id, order_id, return_date, reason)", | |
| ), | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # EXPERT TASKS β Correlated subqueries, complex analytics | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| EXPERT_SCHEMA = """ | |
| CREATE TABLE accounts ( | |
| account_id INTEGER PRIMARY KEY, | |
| holder_name TEXT, | |
| account_type TEXT, | |
| city TEXT, | |
| opened_date TEXT, | |
| balance REAL | |
| ); | |
| CREATE TABLE transactions ( | |
| txn_id INTEGER PRIMARY KEY, | |
| account_id INTEGER, | |
| txn_date TEXT, | |
| txn_type TEXT, | |
| amount REAL, | |
| category TEXT, | |
| FOREIGN KEY (account_id) REFERENCES accounts(account_id) | |
| ); | |
| CREATE TABLE loans ( | |
| loan_id INTEGER PRIMARY KEY, | |
| account_id INTEGER, | |
| loan_type TEXT, | |
| principal REAL, | |
| interest_rate REAL, | |
| start_date TEXT, | |
| status TEXT, | |
| FOREIGN KEY (account_id) REFERENCES accounts(account_id) | |
| );""" | |
| EXPERT_SEED = """ | |
| INSERT INTO accounts VALUES | |
| (1,'Arjun Sharma','Savings','Mumbai','2018-03-15',125000.00), | |
| (2,'Priya Patel','Current','Delhi','2016-07-22',450000.00), | |
| (3,'Rahul Gupta','Savings','Bangalore','2020-01-10',78000.00), | |
| (4,'Sneha Reddy','Current','Hyderabad','2015-11-30',890000.00), | |
| (5,'Vikram Singh','Savings','Chennai','2019-05-14',42000.00), | |
| (6,'Ananya Joshi','Savings','Pune','2021-08-09',195000.00), | |
| (7,'Karan Mehta','Current','Mumbai','2017-02-28',320000.00), | |
| (8,'Divya Nair','Savings','Bangalore','2022-04-17',55000.00); | |
| INSERT INTO transactions VALUES | |
| (1,1,'2024-01-05','credit',50000,'salary'), | |
| (2,1,'2024-01-12','debit',15000,'rent'), | |
| (3,1,'2024-01-20','debit',8000,'groceries'), | |
| (4,1,'2024-02-05','credit',50000,'salary'), | |
| (5,1,'2024-02-18','debit',22000,'utilities'), | |
| (6,2,'2024-01-08','credit',200000,'business'), | |
| (7,2,'2024-01-15','debit',80000,'supplier'), | |
| (8,2,'2024-01-28','debit',45000,'rent'), | |
| (9,2,'2024-02-10','credit',175000,'business'), | |
| (10,2,'2024-02-22','debit',60000,'supplier'), | |
| (11,3,'2024-01-10','credit',45000,'salary'), | |
| (12,3,'2024-01-25','debit',12000,'rent'), | |
| (13,3,'2024-02-10','credit',45000,'salary'), | |
| (14,3,'2024-02-20','debit',18000,'utilities'), | |
| (15,4,'2024-01-03','credit',500000,'business'), | |
| (16,4,'2024-01-20','debit',200000,'investment'), | |
| (17,4,'2024-02-05','credit',480000,'business'), | |
| (18,4,'2024-02-25','debit',150000,'supplier'), | |
| (19,5,'2024-01-15','credit',38000,'salary'), | |
| (20,5,'2024-01-30','debit',15000,'rent'), | |
| (21,6,'2024-01-08','credit',85000,'salary'), | |
| (22,6,'2024-01-22','debit',25000,'rent'), | |
| (23,6,'2024-02-08','credit',85000,'salary'), | |
| (24,6,'2024-02-20','debit',30000,'investment'), | |
| (25,7,'2024-01-12','credit',150000,'business'), | |
| (26,7,'2024-01-28','debit',75000,'supplier'), | |
| (27,7,'2024-02-15','credit',160000,'business'), | |
| (28,8,'2024-01-18','credit',52000,'salary'), | |
| (29,8,'2024-02-18','credit',52000,'salary'), | |
| (30,8,'2024-02-25','debit',20000,'rent'); | |
| INSERT INTO loans VALUES | |
| (1,1,'Home Loan',2500000,8.5,'2020-06-15','active'), | |
| (2,2,'Business Loan',5000000,10.2,'2019-03-20','active'), | |
| (3,3,'Personal Loan',150000,14.5,'2022-08-10','active'), | |
| (4,5,'Vehicle Loan',800000,9.8,'2021-12-05','active'), | |
| (5,6,'Home Loan',3500000,8.2,'2022-09-15','active'), | |
| (6,7,'Business Loan',2000000,11.5,'2018-04-22','closed'), | |
| (7,8,'Personal Loan',100000,15.2,'2023-01-30','active');""" | |
| EXPERT_TASKS = [ | |
| SQLTask( | |
| task_id="expert_001", | |
| difficulty="expert", | |
| question="For each account, calculate the net cash flow (total credits minus total debits) per month in 2024. Show holder_name, month (YYYY-MM), and net_flow. Order by holder_name, month.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.holder_name, | |
| SUBSTR(t.txn_date,1,7) as month, | |
| ROUND(SUM(CASE WHEN t.txn_type='credit' THEN t.amount ELSE -t.amount END), 2) as net_flow | |
| FROM accounts a | |
| JOIN transactions t ON a.account_id = t.account_id | |
| WHERE t.txn_date LIKE '2024%' | |
| GROUP BY a.account_id, a.holder_name, month | |
| ORDER BY a.holder_name, month;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_002", | |
| difficulty="expert", | |
| question="Find accounts where the total loan principal exceeds 5x their current account balance. Show holder_name, balance, total_loan_principal, and the ratio (rounded to 2 decimals).", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.holder_name, a.balance, | |
| SUM(l.principal) as total_loan_principal, | |
| ROUND(SUM(l.principal) / a.balance, 2) as debt_to_balance_ratio | |
| FROM accounts a | |
| JOIN loans l ON a.account_id = l.account_id | |
| WHERE l.status = 'active' | |
| GROUP BY a.account_id, a.holder_name, a.balance | |
| HAVING SUM(l.principal) > 5 * a.balance | |
| ORDER BY debt_to_balance_ratio DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_003", | |
| difficulty="expert", | |
| question="Using a CTE chain, find the top spending category per account (by total debit amount). Show holder_name, top_category, and category_spend.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| WITH category_spend AS ( | |
| SELECT a.account_id, a.holder_name, t.category, | |
| SUM(t.amount) as spend, | |
| RANK() OVER (PARTITION BY a.account_id ORDER BY SUM(t.amount) DESC) as rnk | |
| FROM accounts a | |
| JOIN transactions t ON a.account_id = t.account_id | |
| WHERE t.txn_type = 'debit' | |
| GROUP BY a.account_id, a.holder_name, t.category | |
| ) | |
| SELECT holder_name, category as top_category, spend as category_spend | |
| FROM category_spend | |
| WHERE rnk = 1 | |
| ORDER BY category_spend DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_004", | |
| difficulty="expert", | |
| question="Calculate the estimated annual interest cost for each active loan. Show holder_name, loan_type, principal, interest_rate, and annual_interest_cost (principal * interest_rate / 100), ordered by annual_interest_cost descending.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.holder_name, l.loan_type, l.principal, l.interest_rate, | |
| ROUND(l.principal * l.interest_rate / 100, 2) as annual_interest_cost | |
| FROM accounts a | |
| JOIN loans l ON a.account_id = l.account_id | |
| WHERE l.status = 'active' | |
| ORDER BY annual_interest_cost DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_005", | |
| difficulty="expert", | |
| question="For each city, calculate the average account balance, total transaction volume, and number of active loans. Show city, avg_balance, total_txn_volume, and active_loan_count.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.city, | |
| ROUND(AVG(a.balance), 2) as avg_balance, | |
| ROUND(SUM(t.amount), 2) as total_txn_volume, | |
| COUNT(DISTINCT l.loan_id) as active_loan_count | |
| FROM accounts a | |
| LEFT JOIN transactions t ON a.account_id = t.account_id | |
| LEFT JOIN loans l ON a.account_id = l.account_id AND l.status = 'active' | |
| GROUP BY a.city | |
| ORDER BY avg_balance DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_006", | |
| difficulty="expert", | |
| question="Find accounts that received salary credits in both January and February 2024. Show holder_name.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.holder_name | |
| FROM accounts a | |
| WHERE EXISTS ( | |
| SELECT 1 FROM transactions t | |
| WHERE t.account_id = a.account_id | |
| AND t.txn_type = 'credit' | |
| AND t.category = 'salary' | |
| AND t.txn_date LIKE '2024-01%' | |
| ) | |
| AND EXISTS ( | |
| SELECT 1 FROM transactions t | |
| WHERE t.account_id = a.account_id | |
| AND t.txn_type = 'credit' | |
| AND t.category = 'salary' | |
| AND t.txn_date LIKE '2024-02%' | |
| ) | |
| ORDER BY a.holder_name;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_007", | |
| difficulty="expert", | |
| question="Calculate savings rate for each salary earner: (total_credits - total_debits) / total_credits. Show holder_name and savings_rate as percentage rounded to 1 decimal. Only include accounts with salary transactions.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| WITH flows AS ( | |
| SELECT a.account_id, a.holder_name, | |
| SUM(CASE WHEN t.txn_type='credit' THEN t.amount ELSE 0 END) as total_credits, | |
| SUM(CASE WHEN t.txn_type='debit' THEN t.amount ELSE 0 END) as total_debits | |
| FROM accounts a | |
| JOIN transactions t ON a.account_id = t.account_id | |
| GROUP BY a.account_id, a.holder_name | |
| ) | |
| SELECT f.holder_name, | |
| ROUND((f.total_credits - f.total_debits) * 100.0 / f.total_credits, 1) as savings_rate | |
| FROM flows f | |
| WHERE f.account_id IN ( | |
| SELECT DISTINCT account_id FROM transactions WHERE category = 'salary' | |
| ) | |
| ORDER BY savings_rate DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_008", | |
| difficulty="expert", | |
| question="Find the account with the highest transaction frequency (most transactions per month on average). Show holder_name, total_transactions, months_active (distinct months with transactions), and avg_txn_per_month rounded to 2 decimals.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| SELECT a.holder_name, | |
| COUNT(t.txn_id) as total_transactions, | |
| COUNT(DISTINCT SUBSTR(t.txn_date,1,7)) as months_active, | |
| ROUND(CAST(COUNT(t.txn_id) AS REAL) / COUNT(DISTINCT SUBSTR(t.txn_date,1,7)), 2) as avg_txn_per_month | |
| FROM accounts a | |
| JOIN transactions t ON a.account_id = t.account_id | |
| GROUP BY a.account_id, a.holder_name | |
| ORDER BY avg_txn_per_month DESC | |
| LIMIT 1;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_009", | |
| difficulty="expert", | |
| question="Identify accounts that have both an active loan AND whose total debits in 2024 exceed their total credits in 2024 (cash flow negative). Show holder_name, net_flow_2024, and total_active_loan_principal.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| WITH cash_flow AS ( | |
| SELECT account_id, | |
| SUM(CASE WHEN txn_type='credit' THEN amount ELSE -amount END) as net_flow | |
| FROM transactions | |
| WHERE txn_date LIKE '2024%' | |
| GROUP BY account_id | |
| ), | |
| active_loans AS ( | |
| SELECT account_id, SUM(principal) as total_principal | |
| FROM loans WHERE status='active' | |
| GROUP BY account_id | |
| ) | |
| SELECT a.holder_name, ROUND(cf.net_flow, 2) as net_flow_2024, al.total_principal | |
| FROM accounts a | |
| JOIN cash_flow cf ON a.account_id = cf.account_id | |
| JOIN active_loans al ON a.account_id = al.account_id | |
| WHERE cf.net_flow < 0 | |
| ORDER BY cf.net_flow;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| SQLTask( | |
| task_id="expert_010", | |
| difficulty="expert", | |
| question="Create a financial health score for each account: score = (balance/100000)*40 + (net_credits_2024/50000)*40 - (active_loan_principal/1000000)*20, capped between 0 and 100. Show holder_name and health_score rounded to 1 decimal, ordered by score descending.", | |
| schema_sql=EXPERT_SCHEMA, | |
| seed_sql=EXPERT_SEED, | |
| solution_sql=""" | |
| WITH metrics AS ( | |
| SELECT a.account_id, a.holder_name, a.balance, | |
| COALESCE(SUM(CASE WHEN t.txn_type='credit' THEN t.amount ELSE 0 END), 0) as net_credits, | |
| COALESCE((SELECT SUM(principal) FROM loans l WHERE l.account_id=a.account_id AND l.status='active'), 0) as loan_principal | |
| FROM accounts a | |
| LEFT JOIN transactions t ON a.account_id = t.account_id AND t.txn_date LIKE '2024%' | |
| GROUP BY a.account_id, a.holder_name, a.balance | |
| ) | |
| SELECT holder_name, | |
| ROUND(MIN(100, MAX(0, | |
| (balance/100000.0)*40 + | |
| (net_credits/50000.0)*40 - | |
| (loan_principal/1000000.0)*20 | |
| )), 1) as health_score | |
| FROM metrics | |
| ORDER BY health_score DESC;""", | |
| schema_description="accounts(account_id, holder_name, account_type, city, opened_date, balance), transactions(txn_id, account_id, txn_date, txn_type[credit/debit], amount, category), loans(loan_id, account_id, loan_type, principal, interest_rate, start_date, status)", | |
| ), | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Task registry | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| ALL_TASKS: Dict[str, SQLTask] = {} | |
| for _t in EASY_TASKS + MEDIUM_TASKS + HARD_TASKS + EXPERT_TASKS: | |
| ALL_TASKS[_t.task_id] = _t | |
| TASKS_BY_DIFFICULTY: Dict[str, List[SQLTask]] = { | |
| "easy": EASY_TASKS, | |
| "medium": MEDIUM_TASKS, | |
| "hard": HARD_TASKS, | |
| "expert": EXPERT_TASKS, | |
| } | |
| def get_task(task_id: str) -> Optional[SQLTask]: | |
| return ALL_TASKS.get(task_id) | |
| def get_tasks_by_difficulty(difficulty: str) -> List[SQLTask]: | |
| return TASKS_BY_DIFFICULTY.get(difficulty, []) |