Spaces:
Running
Running
File size: 6,567 Bytes
30cf758 bc9f459 30cf758 bc9f459 30cf758 bc9f459 30cf758 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | """
TASK 2 β MEDIUM: Logic Error Fix
Difficulty: Medium
Bug types: Wrong JOIN type causing missing rows, incorrect aggregation logic,
missing HAVING clause, wrong date filter
Max steps: 20
Expected baseline model score: 0.3-0.6
"""
from typing import List, Dict, Any
from .base import BaseTask
class MediumTask(BaseTask):
"""
Scenario: HR analytics team wants monthly headcount and average salary
by department for the current year, including departments with zero employees
(i.e., departments that exist but no one joined this year).
Bugs:
1. Uses INNER JOIN instead of LEFT JOIN β excludes empty departments
2. Uses AVG(salary) over all employees instead of only those who joined this year
3. Missing: the date filter for 'this year' is applied in WHERE, breaking the LEFT JOIN
(should be in ON clause or use CASE)
4. GROUP BY missing department_id (ambiguous grouping)
"""
@property
def task_id(self) -> str:
return "medium_logic_fix"
@property
def name(self) -> str:
return "Department Headcount Report β Logic Error Fix"
@property
def difficulty(self) -> str:
return "medium"
@property
def description(self) -> str:
return """You are debugging a HR analytics SQL query.
The query should produce a monthly department headcount report showing:
- department_name
- headcount: number of employees who joined IN 2023
- avg_salary: average salary of employees who joined IN 2023
- All departments must appear, even those with 0 new hires in 2023
The current query has 3 logic bugs:
1. It uses the wrong JOIN type, which silently drops departments with no 2023 hires
2. The WHERE clause on hire_date breaks the outer join semantics
3. The AVG calculation includes employees from all years, not just 2023
Fix these logic errors. The result should be ordered by department_name ascending."""
@property
def expected_output_description(self) -> str:
return "4 rows (all departments), headcount=0 for 'Legal', correct avg_salary only from 2023 hires."
@property
def broken_query(self) -> str:
return """SELECT
d.name AS department_name,
COUNT(e.id) AS headcount,
ROUND(AVG(e.salary), 2) AS avg_salary
FROM departments d
INNER JOIN employees e ON d.id = e.department_id
WHERE strftime('%Y', e.hire_date) = '2023'
GROUP BY d.name
ORDER BY department_name ASC"""
@property
def schema_sql(self) -> str:
return """
CREATE TABLE departments (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
budget REAL
);
CREATE TABLE employees (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
department_id INTEGER NOT NULL,
salary REAL NOT NULL,
hire_date TEXT NOT NULL,
FOREIGN KEY (department_id) REFERENCES departments(id)
)"""
@property
def seed_data_sql(self) -> str:
return """
INSERT INTO departments VALUES (1,'Engineering',500000);
INSERT INTO departments VALUES (2,'Marketing',200000);
INSERT INTO departments VALUES (3,'Sales',300000);
INSERT INTO departments VALUES (4,'Legal',150000);
INSERT INTO employees VALUES (1,'Ana Lima',1,95000,'2023-03-15');
INSERT INTO employees VALUES (2,'Ben Sharma',1,102000,'2023-06-01');
INSERT INTO employees VALUES (3,'Chris Wang',1,88000,'2022-01-10');
INSERT INTO employees VALUES (4,'Diana Patel',2,72000,'2023-04-20');
INSERT INTO employees VALUES (5,'Erik Johnson',2,68000,'2022-11-05');
INSERT INTO employees VALUES (6,'Fatima Al-Hassan',3,55000,'2023-01-08');
INSERT INTO employees VALUES (7,'George Okafor',3,61000,'2023-07-22');
INSERT INTO employees VALUES (8,'Hannah Kim',3,58000,'2022-05-30');
INSERT INTO employees VALUES (9,'Ivan Petrov',1,91000,'2022-08-14')"""
@property
def expected_output(self) -> List[Dict[str, Any]]:
# Engineering 2023 hires: Ana 95000, Ben 102000 β count=2, avg=98500
# Marketing 2023 hires: Diana 72000 β count=1, avg=72000
# Sales 2023 hires: Fatima 55000, George 61000 β count=2, avg=58000
# Legal 2023 hires: none β count=0, avg=NULL
return [
{"department_name": "Engineering", "headcount": 2, "avg_salary": 98500.00},
{"department_name": "Legal", "headcount": 0, "avg_salary": None},
{"department_name": "Marketing", "headcount": 1, "avg_salary": 72000.00},
{"department_name": "Sales", "headcount": 2, "avg_salary": 58000.00},
]
@property
def hint(self) -> str:
return "Hint: When you want ALL rows from the left table even when there's no match on the right, think about which JOIN type preserves those rows. Also, WHERE on a nullable column after a join changes join semantics β consider moving that condition."
class MediumTaskGrader:
"""
Custom grader for medium task β handles NULL comparison.
"""
_MIN_STRICT_SCORE = 0.001
_MAX_STRICT_SCORE = 0.999
@staticmethod
def _strict_score(score: float) -> float:
return round(
min(MediumTaskGrader._MAX_STRICT_SCORE, max(MediumTaskGrader._MIN_STRICT_SCORE, score)),
3,
)
@staticmethod
def grade(actual: List[Dict]) -> float:
if not actual or len(actual) != 4:
return MediumTaskGrader._strict_score(0.0)
# Sort both by dept name for comparison
actual_sorted = sorted(actual, key=lambda r: r.get("department_name", ""))
expected = [
{"department_name": "Engineering", "headcount": 2, "avg_salary": 98500.00},
{"department_name": "Legal", "headcount": 0, "avg_salary": None},
{"department_name": "Marketing", "headcount": 1, "avg_salary": 72000.00},
{"department_name": "Sales", "headcount": 2, "avg_salary": 58000.00},
]
matches = 0
for a, e in zip(actual_sorted, expected):
dept_ok = str(a.get("department_name","")).lower() == str(e["department_name"]).lower()
count_ok = int(a.get("headcount", -1)) == e["headcount"]
e_salary = e["avg_salary"]
a_salary = a.get("avg_salary")
if e_salary is None:
salary_ok = a_salary is None or a_salary == 0
else:
try:
salary_ok = abs(float(a_salary) - float(e_salary)) < 1.0
except (TypeError, ValueError):
salary_ok = False
if dept_ok and count_ok and salary_ok:
matches += 1
return MediumTaskGrader._strict_score(matches / 4)
|