Spaces:
Running
Running
Commit ·
2650443
1
Parent(s): 2f50e0a
fixed partition by , top group
Browse files- ai/sql_pattern_checker.py +60 -0
ai/sql_pattern_checker.py
CHANGED
|
@@ -232,6 +232,66 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
|
|
| 232 |
),
|
| 233 |
})
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# ── Pattern 2a ───────────────────────────────────────────────────────────
|
| 236 |
# Cumulative/running window applied directly to raw table rows without
|
| 237 |
# pre-aggregating by date. SUM(...) OVER (ORDER BY date) on a raw scan
|
|
|
|
| 232 |
),
|
| 233 |
})
|
| 234 |
|
| 235 |
+
# ── Pattern 1b ───────────────────────────────────────────────────────────
|
| 236 |
+
# "Top X per group" answered as a global sort instead of PARTITION BY ranking.
|
| 237 |
+
#
|
| 238 |
+
# Symptom: GROUP BY has 2+ columns, ORDER BY present, no LIMIT (all rows
|
| 239 |
+
# returned) and no window ranking function (ROW_NUMBER/RANK/DENSE_RANK/
|
| 240 |
+
# PARTITION BY). This returns every group-entity combination sorted globally
|
| 241 |
+
# instead of the top-1 (or top-N) within each group.
|
| 242 |
+
#
|
| 243 |
+
# Example: "top customer per city"
|
| 244 |
+
# WRONG: GROUP BY city, customer ORDER BY revenue DESC → 126 rows (all)
|
| 245 |
+
# RIGHT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC), WHERE rnk=1
|
| 246 |
+
has_window_ranking = bool(re.search(
|
| 247 |
+
r"\b(?:row_number|rank|dense_rank)\s*\(|\bpartition\s+by\b",
|
| 248 |
+
sql_lower,
|
| 249 |
+
))
|
| 250 |
+
has_limit = bool(re.search(r"\blimit\s+\d+", sql_lower))
|
| 251 |
+
has_order_by = bool(re.search(r"\border\s+by\b", sql_lower))
|
| 252 |
+
|
| 253 |
+
if not has_window_ranking and has_order_by and not has_limit:
|
| 254 |
+
# Count distinct columns in GROUP BY clause
|
| 255 |
+
group_by_match = re.search(r"\bgroup\s+by\b(.+?)(?:\border\s+by\b|\blimit\b|\bhaving\b|$)",
|
| 256 |
+
sql_lower, re.DOTALL)
|
| 257 |
+
if group_by_match:
|
| 258 |
+
group_cols = [c.strip() for c in group_by_match.group(1).split(",") if c.strip()]
|
| 259 |
+
if len(group_cols) >= 2:
|
| 260 |
+
issues.append({
|
| 261 |
+
"pattern_name": "top_per_group_missing_partition_by",
|
| 262 |
+
"description": (
|
| 263 |
+
"POSSIBLE WRONG RESULT β 'top per group' answered as a global sort. "
|
| 264 |
+
"The query uses GROUP BY with multiple columns and ORDER BY, but has "
|
| 265 |
+
"no PARTITION BY or ROW_NUMBER/RANK window function and no LIMIT. "
|
| 266 |
+
"This returns ALL rows sorted globally β not one top row per group. "
|
| 267 |
+
"For questions like 'top customer per city' or 'best product per category', "
|
| 268 |
+
"you must use ROW_NUMBER() OVER (PARTITION BY group_col ORDER BY metric DESC) "
|
| 269 |
+
"in a subquery, then filter WHERE rnk = 1 outside."
|
| 270 |
+
),
|
| 271 |
+
"correction": (
|
| 272 |
+
"Re-read the question. If it asks for the top item WITHIN each group "
|
| 273 |
+
"(e.g. 'per city', 'per category', 'for each X'), use this pattern:\n"
|
| 274 |
+
"\n"
|
| 275 |
+
"SELECT group_col, entity_col, metric\n"
|
| 276 |
+
"FROM (\n"
|
| 277 |
+
" SELECT group_col, entity_col,\n"
|
| 278 |
+
" SUM(metric_col) AS metric,\n"
|
| 279 |
+
" ROW_NUMBER() OVER (\n"
|
| 280 |
+
" PARTITION BY group_col\n"
|
| 281 |
+
" ORDER BY SUM(metric_col) DESC\n"
|
| 282 |
+
" ) AS rnk\n"
|
| 283 |
+
" FROM ...\n"
|
| 284 |
+
" WHERE so.status = 'closed'\n"
|
| 285 |
+
" GROUP BY group_col, entity_col\n"
|
| 286 |
+
") t\n"
|
| 287 |
+
"WHERE rnk = 1\n"
|
| 288 |
+
"ORDER BY metric DESC\n"
|
| 289 |
+
"\n"
|
| 290 |
+
"If the question asks for a global top (not per group), add LIMIT N "
|
| 291 |
+
"to the original query instead."
|
| 292 |
+
),
|
| 293 |
+
})
|
| 294 |
+
|
| 295 |
# ── Pattern 2a ───────────────────────────────────────────────────────────
|
| 296 |
# Cumulative/running window applied directly to raw table rows without
|
| 297 |
# pre-aggregating by date. SUM(...) OVER (ORDER BY date) on a raw scan
|