Spaces:
Sleeping
Sleeping
File size: 5,667 Bytes
5db060f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | # tasks/grader_hard.py
"""
Grader for optimize_001 β replace correlated subquery with CTE.
Unlike easy/medium, there are no fixed expected_rows.
Score is entirely driven by query plan quality:
- uses WITH (CTE)
- uses GROUP BY
- uses AVG(
- does NOT use correlated subquery pattern
- executes without error
"""
def grade(
task: dict,
agent_query: str,
run_result: dict,
prev_absolute_score: float = 0.0,
step_count: int = 1,
max_steps: int = 10,
) -> dict:
syntax_ok = run_result["error"] is None
# ββ Syntax ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
if not syntax_ok:
absolute_score = 0.05
delta = round(
max(-0.3, min(0.5, absolute_score - prev_absolute_score)), 4
)
return {
"value": delta,
"absolute_score": absolute_score,
"syntax_ok": False,
"result_score": 0.0,
"plan_score": 0.0,
"delta": delta,
"status": "syntax_error",
"feedback": f"syntax_error: {run_result['error'][:100]}",
"message": f"syntax_error | abs=0.050 | delta={delta:+.3f}",
}
query_upper = agent_query.upper()
# good_patterns = task.get("good_patterns", ["WITH", "GROUP BY", "AVG("])
# ββ Plan component scores βββββββββββββββββββββββββββββββββββββββββββββββββ
# 1. Uses CTE (WITH keyword) β most important signal
has_cte = "WITH" in query_upper
cte_score = 1.0 if has_cte else 0.0
# 2. Uses GROUP BY β required for computing per-user average
has_group_by = "GROUP BY" in query_upper
group_score = 1.0 if has_group_by else 0.0
# 3. Uses AVG β must be aggregating correctly
has_avg = "AVG(" in query_upper
avg_score = 1.0 if has_avg else 0.0
# 4. Correlated subquery penalty β still using the slow pattern
still_correlated = (
"SELECT AVG" in query_upper
and "WHERE" in query_upper
and not has_cte # WITH overrides this penalty
)
correlation_penalty = 0.4 if still_correlated else 0.0
# 5. Execution quality β did the query actually return rows?
rows_returned = len(run_result["rows"])
execution_score = 1.0 if rows_returned > 0 else 0.3
# 0.3 credit for running without error even if empty result
# ββ Plan score weighted combination βββββββββββββββββββββββββββββββββββββββ
# CTE 40% + GROUP BY 25% + AVG 20% + execution 15%
plan_score = round(
max(
0.0,
0.40 * cte_score
+ 0.25 * group_score
+ 0.20 * avg_score
+ 0.15 * execution_score
- correlation_penalty,
),
4,
)
# ββ Efficiency bonus ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
steps_remaining = max_steps - step_count
efficiency_bonus = 0.0
if plan_score >= 0.85:
efficiency_bonus = round(0.05 * (steps_remaining / max_steps), 4)
# ββ Absolute score β hard: syntax 10% + plan 85% + bonus 5% βββββββββββββ
absolute_score = round(
min(0.99, 0.10 * 1.0 + 0.85 * plan_score + efficiency_bonus), 4
)
absolute_score = max(0.05, absolute_score)
# ββ Delta βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
delta = absolute_score - prev_absolute_score
if abs(delta) < 0.001 and step_count > 1:
delta -= 0.02
delta = round(max(-0.3, min(0.5, delta)), 4)
# ββ Feedback βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
issues = []
if not has_cte:
issues.append("missing_cte: query needs WITH clause to precompute averages")
if not has_group_by:
issues.append("missing_group_by: need GROUP BY user_id to compute per-user avg")
if not has_avg:
issues.append("missing_avg: need AVG(amount) in the CTE")
if still_correlated:
issues.append("still_correlated: subquery in WHERE runs per-row β move to CTE")
if rows_returned == 0 and syntax_ok:
issues.append("empty_result: query runs but returns no rows β check JOIN and WHERE")
feedback = "; ".join(issues) if issues else "plan looks optimized"
status = (
"solved" if absolute_score >= 0.99
else "improving" if delta > 0.01
else "regression" if delta < -0.01
else "stalled"
)
return {
"value": delta,
"absolute_score": absolute_score,
"syntax_ok": True,
"result_score": execution_score,
"plan_score": plan_score,
"delta": delta,
"status": status,
"feedback": feedback,
"message": (
f"{status} | abs={absolute_score:.3f} | delta={delta:+.3f} | "
f"plan={plan_score:.0%} | cte={has_cte} | group={has_group_by}"
),
} |