Spaces:
Running
Running
File size: 49,444 Bytes
a39d8ef | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 | """
data_factory/templates.py
==========================
Human-authored, execution-verified SQL templates across 4 domains Γ 3 difficulty tiers.
CRITICAL DESIGN PRINCIPLE:
SQL is NEVER generated by an LLM in this pipeline.
Every SQL here was written by hand and verified by running it against
seeded SQLite data. Zero errors guaranteed.
Structure per entry:
{
"domain": str, # ecommerce | healthcare | finance | hr
"difficulty": str, # easy | medium | hard
"sql": str, # verified ground-truth SQL
"description": str, # one-line English summary (seed for NL generation)
"base_nl": str, # canonical natural-language question
"has_order": bool, # True β comparison is order-sensitive
}
"""
from __future__ import annotations
from typing import TypedDict
class Template(TypedDict):
domain: str
difficulty: str
sql: str
description: str
base_nl: str
has_order: bool
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DOMAIN: ECOMMERCE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
ECOMMERCE_TEMPLATES: list[Template] = [
# ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "List gold-tier customers sorted alphabetically with id, name, email, country",
"base_nl": "List all gold-tier customers ordered by name alphabetically. Return id, name, email, country.",
"sql": (
"SELECT id, name, email, country "
"FROM customers "
"WHERE tier = 'gold' "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products priced above $100, sorted by price descending",
"base_nl": "Show all products with a price above $100, sorted from highest to lowest price. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"WHERE price > 100 "
"ORDER BY price DESC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Delivered orders with total_amount > 200, sorted by amount descending",
"base_nl": "Find all delivered orders with a total amount greater than $200, sorted by total amount descending. Return id, customer_id, total_amount, created_at.",
"sql": (
"SELECT id, customer_id, total_amount, created_at "
"FROM orders "
"WHERE status = 'delivered' "
" AND total_amount > 200 "
"ORDER BY total_amount DESC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Top 5 most expensive products",
"base_nl": "Return the top 5 most expensive products. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"ORDER BY price DESC "
"LIMIT 5"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Distinct countries where customers come from, sorted alphabetically",
"base_nl": "List all distinct countries our customers come from, sorted alphabetically. Return country.",
"sql": (
"SELECT DISTINCT country "
"FROM customers "
"ORDER BY country ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": False,
"description": "Count total number of customers",
"base_nl": "How many customers do we have in total? Return a single column total_customers.",
"sql": "SELECT COUNT(*) AS total_customers FROM customers",
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products with zero stock",
"base_nl": "List all out-of-stock products. Return id, name, stock_quantity.",
"sql": (
"SELECT id, name, stock_quantity "
"FROM products "
"WHERE stock_quantity = 0 "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Customers from India sorted by name",
"base_nl": "Show all customers from India, sorted by name. Return id, name, email.",
"sql": (
"SELECT id, name, email "
"FROM customers "
"WHERE country = 'India' "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products in a price range of $20 to $100 sorted by price ascending",
"base_nl": "Which products are priced between $20 and $100? Sort by price ascending. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"WHERE price BETWEEN 20 AND 100 "
"ORDER BY price ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": False,
"description": "Count orders by status",
"base_nl": "How many orders are there for each status? Return status, order_count.",
"sql": (
"SELECT status, COUNT(*) AS order_count "
"FROM orders "
"GROUP BY status"
),
},
# ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Order count per customer including those with zero orders, sorted by count desc",
"base_nl": "How many orders has each customer placed? Include customers with zero orders. Return customer_name, order_count, sorted by order_count descending then customer_name ascending.",
"sql": (
"SELECT c.name AS customer_name, COUNT(o.id) AS order_count "
"FROM customers c "
"LEFT JOIN orders o ON c.id = o.customer_id "
"GROUP BY c.id, c.name "
"ORDER BY order_count DESC, customer_name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Average product rating per category sorted descending",
"base_nl": "What is the average product rating per category? Only include categories with at least one review. Return category_name, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.",
"sql": (
"SELECT c.name AS category_name, "
" ROUND(AVG(r.rating), 2) AS avg_rating "
"FROM categories c "
"JOIN products p ON p.category_id = c.id "
"JOIN reviews r ON r.product_id = p.id "
"GROUP BY c.id, c.name "
"ORDER BY avg_rating DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Customers who spent more than $500 on delivered orders",
"base_nl": "Which customers have spent more than $500 total on delivered orders? Return customer_name, total_spent (rounded to 2 decimal places), sorted by total_spent descending.",
"sql": (
"SELECT c.name AS customer_name, "
" ROUND(SUM(o.total_amount), 2) AS total_spent "
"FROM customers c "
"JOIN orders o ON o.customer_id = c.id "
"WHERE o.status = 'delivered' "
"GROUP BY c.id, c.name "
"HAVING SUM(o.total_amount) > 500 "
"ORDER BY total_spent DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Total quantity sold per product sorted descending",
"base_nl": "Show the total quantity sold for each product that appears in at least one order. Return product_name, total_quantity_sold, sorted by total_quantity_sold descending.",
"sql": (
"SELECT p.name AS product_name, "
" SUM(oi.quantity) AS total_quantity_sold "
"FROM products p "
"JOIN order_items oi ON oi.product_id = p.id "
"GROUP BY p.id, p.name "
"ORDER BY total_quantity_sold DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Product count and average price per category sorted by count desc",
"base_nl": "For each category, show the number of products and their average price. Return category_name, product_count, avg_price (rounded to 2 decimal places), sorted by product_count descending.",
"sql": (
"SELECT cat.name AS category_name, "
" COUNT(p.id) AS product_count, "
" ROUND(AVG(p.price), 2) AS avg_price "
"FROM categories cat "
"JOIN products p ON p.category_id = cat.id "
"GROUP BY cat.id, cat.name "
"ORDER BY product_count DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Categories with more than 5 in-stock products sorted by count desc",
"base_nl": "Which categories have more than 5 products in stock (stock_quantity > 0)? Return category_name, in_stock_count, sorted by in_stock_count descending.",
"sql": (
"SELECT c.name AS category_name, "
" COUNT(p.id) AS in_stock_count "
"FROM categories c "
"JOIN products p ON p.category_id = c.id "
"WHERE p.stock_quantity > 0 "
"GROUP BY c.id, c.name "
"HAVING COUNT(p.id) > 5 "
"ORDER BY in_stock_count DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Total revenue per product from order items, sorted descending",
"base_nl": "What is the total revenue generated by each product from order items? Return product_name, total_revenue (rounded to 2 decimal places), sorted by total_revenue descending.",
"sql": (
"SELECT p.name AS product_name, "
" ROUND(SUM(oi.quantity * oi.unit_price), 2) AS total_revenue "
"FROM products p "
"JOIN order_items oi ON oi.product_id = p.id "
"GROUP BY p.id, p.name "
"ORDER BY total_revenue DESC"
),
},
# ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Customer spending rank using DENSE_RANK on delivered orders",
"base_nl": "Rank customers by total spending on delivered orders using DENSE_RANK (rank 1 = highest spender). Return customer_name, total_spent (rounded to 2 decimal places), spending_rank, sorted by spending_rank ascending.",
"sql": (
"SELECT customer_name, total_spent, spending_rank "
"FROM ( "
" SELECT c.name AS customer_name, "
" ROUND(SUM(o.total_amount), 2) AS total_spent, "
" DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank "
" FROM customers c "
" JOIN orders o ON o.customer_id = c.id "
" WHERE o.status = 'delivered' "
" GROUP BY c.id, c.name "
") sub "
"ORDER BY spending_rank ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Monthly delivered revenue with running total using window SUM",
"base_nl": "Show the monthly revenue from delivered orders and its running cumulative total. Return month (YYYY-MM), monthly_revenue, running_total (both rounded to 2 decimal places), sorted by month ascending.",
"sql": (
"WITH monthly AS ( "
" SELECT strftime('%Y-%m', created_at) AS month, "
" ROUND(SUM(total_amount), 2) AS monthly_revenue "
" FROM orders "
" WHERE status = 'delivered' "
" GROUP BY strftime('%Y-%m', created_at) "
") "
"SELECT month, "
" monthly_revenue, "
" ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total "
"FROM monthly "
"ORDER BY month ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Customers whose most recent order was cancelled, using ROW_NUMBER CTE",
"base_nl": "Find all customers whose most recent order has status 'cancelled'. Use ROW_NUMBER to identify the latest order per customer. Return customer_name, last_order_status, last_order_date, sorted by customer_name ascending.",
"sql": (
"WITH ranked_orders AS ( "
" SELECT customer_id, status, created_at, "
" ROW_NUMBER() OVER (PARTITION BY customer_id "
" ORDER BY created_at DESC) AS rn "
" FROM orders "
") "
"SELECT c.name AS customer_name, "
" ro.status AS last_order_status, "
" ro.created_at AS last_order_date "
"FROM customers c "
"JOIN ranked_orders ro ON ro.customer_id = c.id "
"WHERE ro.rn = 1 "
" AND ro.status = 'cancelled' "
"ORDER BY customer_name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Products above their category average rating, using two CTEs",
"base_nl": "Find products whose average rating is strictly above the average rating of all products in their category. Return product_name, category_name, product_avg_rating, category_avg_rating (both rounded to 2 decimal places), sorted by product_avg_rating descending then product_name ascending.",
"sql": (
"WITH product_ratings AS ( "
" SELECT p.id AS product_id, p.name AS product_name, "
" p.category_id, c.name AS category_name, "
" ROUND(AVG(r.rating), 2) AS product_avg_rating "
" FROM products p "
" JOIN reviews r ON r.product_id = p.id "
" JOIN categories c ON c.id = p.category_id "
" GROUP BY p.id, p.name, p.category_id, c.name "
"), "
"category_ratings AS ( "
" SELECT category_id, "
" ROUND(AVG(product_avg_rating), 2) AS category_avg_rating "
" FROM product_ratings "
" GROUP BY category_id "
") "
"SELECT pr.product_name, pr.category_name, "
" pr.product_avg_rating, cr.category_avg_rating "
"FROM product_ratings pr "
"JOIN category_ratings cr ON cr.category_id = pr.category_id "
"WHERE pr.product_avg_rating > cr.category_avg_rating "
"ORDER BY pr.product_avg_rating DESC, pr.product_name ASC"
),
},
]
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DOMAIN: HEALTHCARE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
HEALTHCARE_TEMPLATES: list[Template] = [
# ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Doctors sorted by consultation fee descending",
"base_nl": "List all doctors sorted by consultation fee from highest to lowest. Return id, name, specialization, consultation_fee.",
"sql": (
"SELECT id, name, specialization, consultation_fee "
"FROM doctors "
"ORDER BY consultation_fee DESC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Doctors with more than 10 years experience sorted desc",
"base_nl": "Show doctors with more than 10 years of experience, sorted by experience descending. Return id, name, specialization, experience_years.",
"sql": (
"SELECT id, name, specialization, experience_years "
"FROM doctors "
"WHERE experience_years > 10 "
"ORDER BY experience_years DESC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Patients from India sorted by name",
"base_nl": "List all patients from India sorted alphabetically by name. Return id, name, country, blood_type.",
"sql": (
"SELECT id, name, country, blood_type "
"FROM patients "
"WHERE country = 'India' "
"ORDER BY name ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Medications with unit price under $0.20 sorted ascending",
"base_nl": "Which medications cost less than $0.20 per unit? Sort by price ascending. Return id, name, category, unit_price.",
"sql": (
"SELECT id, name, category, unit_price "
"FROM medications "
"WHERE unit_price < 0.20 "
"ORDER BY unit_price ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Top 5 most expensive medications",
"base_nl": "What are the top 5 most expensive medications? Return id, name, unit_price.",
"sql": (
"SELECT id, name, unit_price "
"FROM medications "
"ORDER BY unit_price DESC "
"LIMIT 5"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": False,
"description": "Count of completed appointments",
"base_nl": "How many appointments have been completed? Return a single value total_completed.",
"sql": (
"SELECT COUNT(*) AS total_completed "
"FROM appointments "
"WHERE status = 'completed'"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Severe diagnoses sorted by ICD code",
"base_nl": "List all severe diagnoses sorted by ICD code. Return id, icd_code, description, severity.",
"sql": (
"SELECT id, icd_code, description, severity "
"FROM diagnoses "
"WHERE severity = 'severe' "
"ORDER BY icd_code ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": False,
"description": "Count patients by gender",
"base_nl": "How many patients are there by gender? Return gender, patient_count.",
"sql": (
"SELECT gender, COUNT(*) AS patient_count "
"FROM patients "
"GROUP BY gender"
),
},
# ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Appointment count per doctor including those with no appointments",
"base_nl": "How many appointments has each doctor had (including those with none)? Return doctor_name, appointment_count, sorted by appointment_count descending.",
"sql": (
"SELECT d.name AS doctor_name, COUNT(a.id) AS appointment_count "
"FROM doctors d "
"LEFT JOIN appointments a ON a.doctor_id = d.id "
"GROUP BY d.id, d.name "
"ORDER BY appointment_count DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Most prescribed medications by count",
"base_nl": "Which medications are prescribed most often? Return medication_name, category, times_prescribed, sorted by times_prescribed descending.",
"sql": (
"SELECT m.name AS medication_name, m.category, COUNT(p.id) AS times_prescribed "
"FROM medications m "
"JOIN prescriptions p ON p.medication_id = m.id "
"GROUP BY m.id, m.name, m.category "
"ORDER BY times_prescribed DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Patients with more than one completed visit",
"base_nl": "Which patients have had more than one completed appointment? Return patient_name, visit_count, sorted by visit_count descending.",
"sql": (
"SELECT pat.name AS patient_name, COUNT(DISTINCT a.id) AS visit_count "
"FROM patients pat "
"JOIN appointments a ON a.patient_id = pat.id "
"WHERE a.status = 'completed' "
"GROUP BY pat.id, pat.name "
"HAVING COUNT(DISTINCT a.id) > 1 "
"ORDER BY visit_count DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Estimated revenue per doctor from completed appointments",
"base_nl": "What is the estimated total revenue per doctor from completed appointments (based on consultation fee)? Return doctor_name, specialization, estimated_revenue (rounded to 2 decimal places), sorted by estimated_revenue descending.",
"sql": (
"SELECT d.name AS doctor_name, d.specialization, "
" ROUND(SUM(d.consultation_fee), 2) AS estimated_revenue "
"FROM doctors d "
"JOIN appointments a ON a.doctor_id = d.id "
"WHERE a.status = 'completed' "
"GROUP BY d.id, d.name, d.specialization "
"ORDER BY estimated_revenue DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Diagnosis count per severity level",
"base_nl": "How many diagnoses are there per severity level? Return severity, diagnosis_count, sorted by diagnosis_count descending.",
"sql": (
"SELECT severity, COUNT(*) AS diagnosis_count "
"FROM diagnoses "
"GROUP BY severity "
"ORDER BY diagnosis_count DESC"
),
},
# ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Doctors ranked by appointment count within specialization using RANK",
"base_nl": "Rank doctors by appointment count within their specialization (rank 1 = most appointments). Return doctor_name, specialization, appointment_count, rank_in_spec, sorted by specialization then rank_in_spec ascending.",
"sql": (
"SELECT doctor_name, specialization, appointment_count, "
" RANK() OVER (PARTITION BY specialization ORDER BY appointment_count DESC) AS rank_in_spec "
"FROM ( "
" SELECT d.name AS doctor_name, d.specialization, COUNT(a.id) AS appointment_count "
" FROM doctors d "
" JOIN appointments a ON a.doctor_id = d.id "
" GROUP BY d.id, d.name, d.specialization "
") sub "
"ORDER BY specialization, rank_in_spec"
),
},
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Top 10 patients by total completed visits using CTE",
"base_nl": "Find the top 10 patients by number of completed appointments. Return patient_name, total_visits, last_visit, sorted by total_visits descending.",
"sql": (
"WITH patient_visits AS ( "
" SELECT a.patient_id, COUNT(a.id) AS total_visits, "
" MAX(a.scheduled_at) AS last_visit "
" FROM appointments a "
" WHERE a.status = 'completed' "
" GROUP BY a.patient_id "
") "
"SELECT p.name AS patient_name, pv.total_visits, pv.last_visit "
"FROM patients p "
"JOIN patient_visits pv ON pv.patient_id = p.id "
"ORDER BY pv.total_visits DESC "
"LIMIT 10"
),
},
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Medications total prescription cost per category using window SUM",
"base_nl": "For each medication, show its total prescription cost (unit_price Γ quantity) and the running total of cost within its category. Return medication_name, category, total_cost, category_running_cost (both rounded to 2 decimal places), sorted by category then total_cost descending.",
"sql": (
"WITH med_costs AS ( "
" SELECT m.name AS medication_name, m.category, "
" ROUND(SUM(m.unit_price * pr.quantity), 2) AS total_cost "
" FROM medications m "
" JOIN prescriptions pr ON pr.medication_id = m.id "
" GROUP BY m.id, m.name, m.category "
") "
"SELECT medication_name, category, total_cost, "
" ROUND(SUM(total_cost) OVER (PARTITION BY category ORDER BY total_cost DESC), 2) "
" AS category_running_cost "
"FROM med_costs "
"ORDER BY category, total_cost DESC"
),
},
]
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DOMAIN: FINANCE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
FINANCE_TEMPLATES: list[Template] = [
# ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Verified KYC customers sorted by name",
"base_nl": "List all customers with verified KYC status, sorted alphabetically. Return id, name, country, kyc_status.",
"sql": (
"SELECT id, name, country, kyc_status "
"FROM fin_customers "
"WHERE kyc_status = 'verified' "
"ORDER BY name ASC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Accounts with balance over $10,000 sorted by balance descending",
"base_nl": "Which accounts have a balance greater than $10,000? Return id, customer_id, account_type, balance, sorted by balance descending.",
"sql": (
"SELECT id, customer_id, account_type, balance "
"FROM accounts "
"WHERE balance > 10000 "
"ORDER BY balance DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Large credit transactions above $1,000 sorted by amount descending",
"base_nl": "Show all credit transactions with an amount greater than $1,000. Return id, account_id, txn_type, amount, created_at, sorted by amount descending.",
"sql": (
"SELECT id, account_id, txn_type, amount, created_at "
"FROM transactions "
"WHERE txn_type = 'credit' AND amount > 1000 "
"ORDER BY amount DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Defaulted loans sorted by principal amount descending",
"base_nl": "List all defaulted loans, sorted by principal amount descending. Return id, loan_type, principal_amount, interest_rate, status.",
"sql": (
"SELECT id, loan_type, principal_amount, interest_rate, status "
"FROM loans "
"WHERE status = 'defaulted' "
"ORDER BY principal_amount DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": False,
"description": "Count of late loan payments",
"base_nl": "How many loan payments were made late? Return a single value late_payments.",
"sql": "SELECT COUNT(*) AS late_payments FROM loan_payments WHERE is_late = 1",
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Top 5 highest principal loans",
"base_nl": "What are the top 5 loans by principal amount? Return id, customer_id, loan_type, principal_amount.",
"sql": (
"SELECT id, customer_id, loan_type, principal_amount "
"FROM loans "
"ORDER BY principal_amount DESC "
"LIMIT 5"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": False,
"description": "Count of accounts by account type",
"base_nl": "How many accounts exist for each account type? Return account_type, account_count.",
"sql": (
"SELECT account_type, COUNT(*) AS account_count "
"FROM accounts "
"GROUP BY account_type"
),
},
# ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total active account balance per customer sorted by balance descending",
"base_nl": "What is the total active account balance per customer? Return customer_name, account_count, total_balance (rounded to 2 decimal places), sorted by total_balance descending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(a.id) AS account_count, "
" ROUND(SUM(a.balance), 2) AS total_balance "
"FROM fin_customers fc "
"JOIN accounts a ON a.customer_id = fc.id "
"WHERE a.status = 'active' "
"GROUP BY fc.id, fc.name "
"ORDER BY total_balance DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total credit transaction amount by account type",
"base_nl": "What is the total credit amount per account type? Return account_type, total_credits (rounded to 2 decimal places), sorted by total_credits descending.",
"sql": (
"SELECT a.account_type, ROUND(SUM(t.amount), 2) AS total_credits "
"FROM accounts a "
"JOIN transactions t ON t.account_id = a.id "
"WHERE t.txn_type = 'credit' "
"GROUP BY a.account_type "
"ORDER BY total_credits DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total loan borrowing per customer sorted descending",
"base_nl": "How much has each customer borrowed in total across all loans? Return customer_name, loan_count, total_borrowed (rounded to 2 decimal places), sorted by total_borrowed descending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count, "
" ROUND(SUM(l.principal_amount), 2) AS total_borrowed "
"FROM fin_customers fc "
"JOIN loans l ON l.customer_id = fc.id "
"GROUP BY fc.id, fc.name "
"ORDER BY total_borrowed DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Late payment count and total amount by loan type",
"base_nl": "For each loan type, how many late payments were there and what was the total amount paid late? Return loan_type, late_payments, total_late_paid (rounded to 2 decimal places), sorted by late_payments descending.",
"sql": (
"SELECT l.loan_type, COUNT(lp.id) AS late_payments, "
" ROUND(SUM(lp.amount_paid), 2) AS total_late_paid "
"FROM loans l "
"JOIN loan_payments lp ON lp.loan_id = l.id "
"WHERE lp.is_late = 1 "
"GROUP BY l.loan_type "
"ORDER BY late_payments DESC"
),
},
# ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Customer balance rank using DENSE_RANK on active accounts",
"base_nl": "Rank customers by their total active account balance using DENSE_RANK. Return customer_name, total_balance, balance_rank, sorted by balance_rank ascending.",
"sql": (
"SELECT customer_name, total_balance, "
" DENSE_RANK() OVER (ORDER BY total_balance DESC) AS balance_rank "
"FROM ( "
" SELECT fc.name AS customer_name, "
" ROUND(SUM(a.balance), 2) AS total_balance "
" FROM fin_customers fc "
" JOIN accounts a ON a.customer_id = fc.id "
" WHERE a.status = 'active' "
" GROUP BY fc.id, fc.name "
") sub "
"ORDER BY balance_rank"
),
},
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Monthly transaction totals by type with running total using window SUM",
"base_nl": "Show monthly transaction totals per type (credit/debit) with a running cumulative total. Return month (YYYY-MM), txn_type, total, running_total (rounded to 2 decimal places), sorted by month then txn_type.",
"sql": (
"WITH monthly_txn AS ( "
" SELECT strftime('%Y-%m', created_at) AS month, "
" txn_type, "
" ROUND(SUM(amount), 2) AS total "
" FROM transactions "
" GROUP BY strftime('%Y-%m', created_at), txn_type "
") "
"SELECT month, txn_type, total, "
" ROUND(SUM(total) OVER (PARTITION BY txn_type ORDER BY month), 2) AS running_total "
"FROM monthly_txn "
"ORDER BY month, txn_type"
),
},
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Customers with only defaulted loans using NOT EXISTS",
"base_nl": "Find customers who have at least one loan and ALL their loans are defaulted. Return customer_name, loan_count, sorted by customer_name ascending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count "
"FROM fin_customers fc "
"JOIN loans l ON l.customer_id = fc.id "
"GROUP BY fc.id, fc.name "
"HAVING COUNT(l.id) > 0 "
" AND SUM(CASE WHEN l.status != 'defaulted' THEN 1 ELSE 0 END) = 0 "
"ORDER BY customer_name ASC"
),
},
]
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DOMAIN: HR
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
HR_TEMPLATES: list[Template] = [
# ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active employees sorted by salary descending",
"base_nl": "List all active employees sorted by salary from highest to lowest. Return id, name, job_title, salary.",
"sql": (
"SELECT id, name, job_title, salary "
"FROM employees "
"WHERE status = 'active' "
"ORDER BY salary DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Departments sorted by budget descending",
"base_nl": "Show all departments sorted by budget from largest to smallest. Return id, name, location, budget.",
"sql": (
"SELECT id, name, location, budget "
"FROM departments "
"ORDER BY budget DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Employees hired in 2023 or later sorted by hire date descending",
"base_nl": "Which employees were hired on or after January 1st 2023? Sort by hire date descending. Return id, name, job_title, hire_date.",
"sql": (
"SELECT id, name, job_title, hire_date "
"FROM employees "
"WHERE hire_date >= '2023-01-01' "
"ORDER BY hire_date DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active projects sorted by budget descending",
"base_nl": "Show all currently active projects sorted by budget descending. Return id, name, status, budget.",
"sql": (
"SELECT id, name, status, budget "
"FROM projects "
"WHERE status = 'active' "
"ORDER BY budget DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active employees earning above $100,000 sorted by salary descending",
"base_nl": "Which active employees earn more than $100,000? Return id, name, email, job_title, sorted by salary descending.",
"sql": (
"SELECT id, name, email, job_title "
"FROM employees "
"WHERE status = 'active' AND salary > 100000 "
"ORDER BY salary DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": False,
"description": "Count of active employees",
"base_nl": "How many active employees do we currently have? Return active_employees.",
"sql": "SELECT COUNT(*) AS active_employees FROM employees WHERE status = 'active'",
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Projects with no end date (ongoing) sorted by budget descending",
"base_nl": "List all ongoing projects that have no end date set. Return id, name, start_date, budget, sorted by budget descending.",
"sql": (
"SELECT id, name, start_date, budget "
"FROM projects "
"WHERE end_date IS NULL "
"ORDER BY budget DESC"
),
},
# ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Headcount and average salary per department for active employees",
"base_nl": "For each department, what is the headcount and average salary of active employees? Return department_name, headcount, avg_salary (rounded to 2 decimal places), sorted by headcount descending.",
"sql": (
"SELECT d.name AS department_name, COUNT(e.id) AS headcount, "
" ROUND(AVG(e.salary), 2) AS avg_salary "
"FROM departments d "
"LEFT JOIN employees e ON e.department_id = d.id AND e.status = 'active' "
"GROUP BY d.id, d.name "
"ORDER BY headcount DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Average performance rating per employee sorted descending",
"base_nl": "What is the average performance review rating per active employee? Return employee_name, job_title, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.",
"sql": (
"SELECT e.name AS employee_name, e.job_title, "
" ROUND(AVG(pr.rating), 2) AS avg_rating "
"FROM employees e "
"JOIN performance_reviews pr ON pr.employee_id = e.id "
"WHERE e.status = 'active' "
"GROUP BY e.id, e.name, e.job_title "
"ORDER BY avg_rating DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Employees with the most total allocated project hours",
"base_nl": "Which employees have the most total hours allocated across projects? Return employee_name, total_hours, sorted by total_hours descending, top 10.",
"sql": (
"SELECT e.name AS employee_name, SUM(pa.hours_allocated) AS total_hours "
"FROM employees e "
"JOIN project_assignments pa ON pa.employee_id = e.id "
"GROUP BY e.id, e.name "
"ORDER BY total_hours DESC "
"LIMIT 10"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Departments with distinct employees assigned to active projects",
"base_nl": "For each department, how many distinct employees are assigned to active projects? Return department_name, assigned_employees, sorted by assigned_employees descending.",
"sql": (
"SELECT d.name AS department_name, "
" COUNT(DISTINCT pa.employee_id) AS assigned_employees "
"FROM departments d "
"JOIN projects p ON p.department_id = d.id "
"JOIN project_assignments pa ON pa.project_id = p.id "
"WHERE p.status = 'active' "
"GROUP BY d.id, d.name "
"ORDER BY assigned_employees DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Total project budget per department sorted descending",
"base_nl": "What is the total project budget per department? Return department_name, total_project_budget (rounded to 2 decimal places), sorted by total_project_budget descending.",
"sql": (
"SELECT d.name AS department_name, "
" ROUND(SUM(p.budget), 2) AS total_project_budget "
"FROM departments d "
"JOIN projects p ON p.department_id = d.id "
"GROUP BY d.id, d.name "
"ORDER BY total_project_budget DESC"
),
},
# ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Salary rank within department using DENSE_RANK",
"base_nl": "Rank active employees by salary within their department using DENSE_RANK (rank 1 = highest paid). Return employee_name, salary, department_name, salary_rank, sorted by department_name then salary_rank ascending.",
"sql": (
"SELECT employee_name, salary, department_name, "
" DENSE_RANK() OVER (PARTITION BY department_name ORDER BY salary DESC) AS salary_rank "
"FROM ( "
" SELECT e.name AS employee_name, e.salary, d.name AS department_name "
" FROM employees e "
" JOIN departments d ON d.id = e.department_id "
" WHERE e.status = 'active' "
") sub "
"ORDER BY department_name, salary_rank"
),
},
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Employee performance band classification using CASE with avg rating CTE",
"base_nl": "Classify active employees into performance bands (High Performer: avg rating >= 4, Average: >= 3, Needs Improvement: < 3) based on their average review rating. Return employee_name, salary, avg_rating, performance_band, sorted by avg_rating descending.",
"sql": (
"WITH avg_ratings AS ( "
" SELECT employee_id, ROUND(AVG(rating), 2) AS avg_rating "
" FROM performance_reviews "
" GROUP BY employee_id "
") "
"SELECT e.name AS employee_name, e.salary, ar.avg_rating, "
" CASE WHEN ar.avg_rating >= 4 THEN 'High Performer' "
" WHEN ar.avg_rating >= 3 THEN 'Average' "
" ELSE 'Needs Improvement' "
" END AS performance_band "
"FROM employees e "
"JOIN avg_ratings ar ON ar.employee_id = e.id "
"WHERE e.status = 'active' "
"ORDER BY ar.avg_rating DESC"
),
},
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Employees above their department average salary using CTE",
"base_nl": "Find active employees whose salary is above their department's average. Return employee_name, department_name, salary, dept_avg_salary (rounded to 2 decimal places), sorted by salary descending.",
"sql": (
"WITH dept_avg AS ( "
" SELECT department_id, ROUND(AVG(salary), 2) AS dept_avg_salary "
" FROM employees "
" WHERE status = 'active' "
" GROUP BY department_id "
") "
"SELECT e.name AS employee_name, d.name AS department_name, "
" e.salary, da.dept_avg_salary "
"FROM employees e "
"JOIN departments d ON d.id = e.department_id "
"JOIN dept_avg da ON da.department_id = e.department_id "
"WHERE e.status = 'active' AND e.salary > da.dept_avg_salary "
"ORDER BY e.salary DESC"
),
},
]
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# MASTER TEMPLATE REGISTRY
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
ALL_TEMPLATES: list[Template] = (
ECOMMERCE_TEMPLATES +
HEALTHCARE_TEMPLATES +
FINANCE_TEMPLATES +
HR_TEMPLATES
)
TEMPLATES_BY_DOMAIN: dict[str, list[Template]] = {
"ecommerce": ECOMMERCE_TEMPLATES,
"healthcare": HEALTHCARE_TEMPLATES,
"finance": FINANCE_TEMPLATES,
"hr": HR_TEMPLATES,
}
TEMPLATES_BY_DIFFICULTY: dict[str, list[Template]] = {
"easy": [t for t in ALL_TEMPLATES if t["difficulty"] == "easy"],
"medium": [t for t in ALL_TEMPLATES if t["difficulty"] == "medium"],
"hard": [t for t in ALL_TEMPLATES if t["difficulty"] == "hard"],
}
def template_stats() -> dict:
stats: dict = {"total": len(ALL_TEMPLATES), "by_domain": {}, "by_difficulty": {}}
for d in ["ecommerce","healthcare","finance","hr"]:
stats["by_domain"][d] = len(TEMPLATES_BY_DOMAIN[d])
for diff in ["easy","medium","hard"]:
stats["by_difficulty"][diff] = len(TEMPLATES_BY_DIFFICULTY[diff])
return stats
|