sql-db-engineer-agent / dataset /hard_cases.json
junaid0600's picture
Initial commit
f812d5b
Raw
History Blame Contribute Delete
9.48 kB
[
{
"id": "hard_001",
"category": "performance",
"description": "Detect and fix N+1 query pattern: query fetches all users and evaluates two correlated subqueries per user inside SELECT, causing O(n) correlated subquery executions against orders.",
"buggy_query": "SELECT id, name, (SELECT COUNT(*) FROM orders WHERE user_id = u.id) as order_count, (SELECT SUM(total) FROM orders WHERE user_id = u.id) as total_spent FROM users u",
"fixed_query": "SELECT u.id, u.name, COUNT(o.id) as order_count, COALESCE(SUM(o.total), 0) as total_spent FROM users u LEFT JOIN orders o ON u.id = o.user_id GROUP BY u.id, u.name",
"error_message": "No error thrown — query is functionally correct but causes severe performance degradation at scale (O(n) subqueries)",
"database_schema": {
"users": ["id INT PRIMARY KEY", "name VARCHAR(100)", "email VARCHAR(100)"],
"orders": ["id INT PRIMARY KEY", "user_id INT REFERENCES users(id)", "total DECIMAL", "status VARCHAR(20)"]
},
"performance_issue": {
"type": "N+1 query",
"impact": "With 10,000 users: up to 20,000 correlated subquery executions inside a single statement vs 1 join-and-aggregate pass",
"execution_time_buggy_ms": 8500,
"execution_time_fixed_ms": 45
},
"expected_output": [
{"id": 1, "name": "Alice", "order_count": 3, "total_spent": 750.00},
{"id": 2, "name": "Bob", "order_count": 0, "total_spent": 0.00}
],
"error_type": "performance",
"error_location": "Correlated subqueries in SELECT clause",
"fix_description": "Replace correlated subqueries with a single LEFT JOIN + GROUP BY aggregation using COALESCE for null safety",
"estimated_fix_steps": 6,
"scoring_rubric": {
"identifies_n_plus_1_pattern": 0.15,
"identifies_correlated_subquery_cause": 0.1,
"proposes_join_solution": 0.1,
"correct_left_join": 0.1,
"correct_aggregation": 0.1,
"coalesce_null_safety": 0.05,
"explanation_quality": 0.1,
"confidence": 0.05
},
"frontier_model_expected_score": 0.15
},
{
"id": "hard_002",
"category": "performance",
"description": "Fix non-sargable predicate causing full table scan: query wraps the indexed created_at column in YEAR() and MONTH() on a 10M row table, so the existing index cannot be used; fix by rewriting the filter as an index-friendly half-open date range.",
"buggy_query": "SELECT * FROM logs WHERE YEAR(created_at) = 2024 AND MONTH(created_at) = 3",
"fixed_query": "SELECT * FROM logs WHERE created_at >= '2024-03-01' AND created_at < '2024-04-01'",
"error_message": "No error thrown — runs but causes full table scan because function on indexed column prevents index use",
"database_schema": {
"logs": ["id BIGINT PRIMARY KEY", "user_id INT", "action VARCHAR(100)", "created_at TIMESTAMP", "INDEX(created_at)"]
},
"performance_issue": {
"type": "Function on indexed column prevents index use (full table scan)",
"impact": "10M row table: full scan 12s vs index seek 8ms",
"execution_time_buggy_ms": 12000,
"execution_time_fixed_ms": 8,
"rows_scanned_buggy": 10000000,
"rows_scanned_fixed": 85000
},
"expected_output": "Same rows — functionally equivalent but uses index",
"error_type": "performance",
"error_location": "WHERE clause: YEAR() and MONTH() functions prevent index usage on created_at",
"fix_description": "Replace function-wrapped column with range comparison using explicit date literals so the index on created_at is used",
"estimated_fix_steps": 5,
"scoring_rubric": {
"identifies_function_on_column_issue": 0.15,
"identifies_index_not_used": 0.15,
"correct_range_rewrite": 0.15,
"correct_date_boundaries": 0.1,
"explanation_quality": 0.1,
"confidence": 0.05
},
"frontier_model_expected_score": 0.12
},
{
"id": "hard_003",
"category": "performance",
"description": "Fix implicit cartesian product: missing JOIN condition between two large tables causes cross join — billions of rows.",
"buggy_query": "SELECT c.name, o.total FROM customers c, orders o WHERE o.total > 1000",
"fixed_query": "SELECT c.name, o.total FROM customers c JOIN orders o ON c.id = o.customer_id WHERE o.total > 1000",
"error_message": "No error thrown — query returns astronomically wrong result set (cartesian product)",
"database_schema": {
"customers": ["id INT PRIMARY KEY", "name VARCHAR(100)", "country VARCHAR(50)"],
"orders": ["id INT PRIMARY KEY", "customer_id INT REFERENCES customers(id)", "total DECIMAL", "status VARCHAR(20)"]
},
"performance_issue": {
"type": "Implicit cartesian product (missing JOIN condition)",
"impact": "50K customers × 200K orders = 10 billion intermediate rows",
"execution_time_buggy_ms": 999999,
"execution_time_fixed_ms": 120
},
"expected_output": [
{"name": "Alice Corp", "total": 15000.00},
{"name": "Bob Ltd", "total": 3200.00}
],
"error_type": "performance",
"error_location": "FROM clause — two tables listed with comma but no JOIN condition",
"fix_description": "Replace implicit comma join with explicit JOIN ON c.id = o.customer_id to eliminate cartesian product",
"estimated_fix_steps": 5,
"scoring_rubric": {
"identifies_cartesian_product": 0.15,
"identifies_missing_join_condition": 0.15,
"correct_explicit_join": 0.15,
"correct_join_condition": 0.1,
"explanation_quality": 0.1,
"confidence": 0.05
},
"frontier_model_expected_score": 0.18
},
{
"id": "hard_004",
"category": "performance",
"description": "Fix SELECT * in JOIN query causing unnecessary data transfer and preventing covering index: rewrite to select only needed columns.",
"buggy_query": "SELECT * FROM orders o JOIN users u ON o.user_id = u.id JOIN products p ON o.product_id = p.id WHERE o.status = 'pending'",
"fixed_query": "SELECT o.id, o.total, o.status, u.name as user_name, u.email, p.name as product_name, p.price FROM orders o JOIN users u ON o.user_id = u.id JOIN products p ON o.product_id = p.id WHERE o.status = 'pending'",
"error_message": "No error — functionally correct but causes over-fetching, prevents covering index, increases memory pressure",
"database_schema": {
"orders": ["id INT PRIMARY KEY", "user_id INT", "product_id INT", "total DECIMAL", "status VARCHAR(20)", "created_at TIMESTAMP", "updated_at TIMESTAMP", "notes TEXT"],
"users": ["id INT PRIMARY KEY", "name VARCHAR(100)", "email VARCHAR(100)", "password_hash VARCHAR(255)", "created_at TIMESTAMP"],
"products": ["id INT PRIMARY KEY", "name VARCHAR(100)", "price DECIMAL", "stock INT", "description TEXT", "image_url VARCHAR(500)"]
},
"performance_issue": {
"type": "SELECT * with JOINs causes over-fetching and prevents covering index",
"impact": "Fetches 15+ columns per row including TEXT blobs vs 7 needed columns. 3x memory overhead.",
"execution_time_buggy_ms": 890,
"execution_time_fixed_ms": 210
},
"error_type": "performance",
"error_location": "SELECT * across 3-table JOIN",
"fix_description": "Replace SELECT * with explicit column list selecting only the 7 columns actually needed by the application",
"estimated_fix_steps": 5,
"scoring_rubric": {
"identifies_select_star_issue": 0.15,
"identifies_over_fetching": 0.1,
"identifies_covering_index_benefit": 0.1,
"correct_column_selection": 0.15,
"correct_table_aliases": 0.1,
"explanation_quality": 0.1,
"confidence": 0.05
},
"frontier_model_expected_score": 0.20
},
{
"id": "hard_005",
"category": "performance",
"description": "Fix window function misuse: ROW_NUMBER() applied without PARTITION causing global ranking instead of per-department ranking, and without ORDER BY making results non-deterministic.",
"buggy_query": "SELECT id, name, department, salary, ROW_NUMBER() OVER () as rank FROM employees WHERE ROW_NUMBER() OVER () <= 3",
"fixed_query": "SELECT id, name, department, salary, rank FROM (SELECT id, name, department, salary, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as rank FROM employees) ranked WHERE rank <= 3",
"error_message": "ERROR: window functions are not allowed in WHERE clause",
"database_schema": {
"employees": ["id INT PRIMARY KEY", "name VARCHAR(100)", "department VARCHAR(50)", "salary DECIMAL", "hire_date DATE"]
},
"expected_output": [
{"id": 3, "name": "Carol", "department": "Engineering", "salary": 120000, "rank": 1},
{"id": 7, "name": "Dave", "department": "Engineering", "salary": 110000, "rank": 2},
{"id": 12, "name": "Eve", "department": "Marketing", "salary": 95000, "rank": 1}
],
"error_type": "performance",
"error_location": "Window function in WHERE clause and missing PARTITION BY + ORDER BY",
"fix_description": "Wrap in subquery to filter on window function result, add PARTITION BY department and ORDER BY salary DESC for correct per-department ranking",
"estimated_fix_steps": 7,
"scoring_rubric": {
"identifies_window_in_where_error": 0.1,
"identifies_missing_partition_by": 0.1,
"identifies_missing_order_by": 0.1,
"correct_subquery_wrapper": 0.1,
"correct_partition_by": 0.1,
"correct_order_by": 0.1,
"correct_where_on_subquery": 0.1,
"explanation_quality": 0.1,
"confidence": 0.05
},
"frontier_model_expected_score": 0.10
}
]