jashdoshi77 committed on
Commit
2650443
·
1 Parent(s): 2f50e0a

fixed partition by , top group

Browse files
Files changed (1) hide show
  1. ai/sql_pattern_checker.py +60 -0
ai/sql_pattern_checker.py CHANGED
@@ -232,6 +232,66 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
232
  ),
233
  })
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # ── Pattern 2a ───────────────────────────────────────────────────────────
236
  # Cumulative/running window applied directly to raw table rows without
237
  # pre-aggregating by date. SUM(...) OVER (ORDER BY date) on a raw scan
 
232
  ),
233
  })
234
 
235
+ # ── Pattern 1b ───────────────────────────────────────────────────────────
236
+ # "Top X per group" answered as a global sort instead of PARTITION BY ranking.
237
+ #
238
+ # Symptom: GROUP BY has 2+ columns, ORDER BY present, no LIMIT (all rows
239
+ # returned) and no window ranking function (ROW_NUMBER/RANK/DENSE_RANK/
240
+ # PARTITION BY). This returns every group-entity combination sorted globally
241
+ # instead of the top-1 (or top-N) within each group.
242
+ #
243
+ # Example: "top customer per city"
244
+ # WRONG: GROUP BY city, customer ORDER BY revenue DESC → 126 rows (all)
245
+ # RIGHT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC), WHERE rnk=1
246
+ has_window_ranking = bool(re.search(
247
+ r"\b(?:row_number|rank|dense_rank)\s*\(|\bpartition\s+by\b",
248
+ sql_lower,
249
+ ))
250
+ has_limit = bool(re.search(r"\blimit\s+\d+", sql_lower))
251
+ has_order_by = bool(re.search(r"\border\s+by\b", sql_lower))
252
+
253
+ if not has_window_ranking and has_order_by and not has_limit:
254
+ # Count distinct columns in GROUP BY clause
255
+ group_by_match = re.search(r"\bgroup\s+by\b(.+?)(?:\border\s+by\b|\blimit\b|\bhaving\b|$)",
256
+ sql_lower, re.DOTALL)
257
+ if group_by_match:
258
+ group_cols = [c.strip() for c in group_by_match.group(1).split(",") if c.strip()]
259
+ if len(group_cols) >= 2:
260
+ issues.append({
261
+ "pattern_name": "top_per_group_missing_partition_by",
262
+ "description": (
263
+ "POSSIBLE WRONG RESULT — 'top per group' answered as a global sort. "
264
+ "The query uses GROUP BY with multiple columns and ORDER BY, but has "
265
+ "no PARTITION BY or ROW_NUMBER/RANK window function and no LIMIT. "
266
+ "This returns ALL rows sorted globally — not one top row per group. "
267
+ "For questions like 'top customer per city' or 'best product per category', "
268
+ "you must use ROW_NUMBER() OVER (PARTITION BY group_col ORDER BY metric DESC) "
269
+ "in a subquery, then filter WHERE rnk = 1 outside."
270
+ ),
271
+ "correction": (
272
+ "Re-read the question. If it asks for the top item WITHIN each group "
273
+ "(e.g. 'per city', 'per category', 'for each X'), use this pattern:\n"
274
+ "\n"
275
+ "SELECT group_col, entity_col, metric\n"
276
+ "FROM (\n"
277
+ " SELECT group_col, entity_col,\n"
278
+ " SUM(metric_col) AS metric,\n"
279
+ " ROW_NUMBER() OVER (\n"
280
+ " PARTITION BY group_col\n"
281
+ " ORDER BY SUM(metric_col) DESC\n"
282
+ " ) AS rnk\n"
283
+ " FROM ...\n"
284
+ " WHERE so.status = 'closed'\n"
285
+ " GROUP BY group_col, entity_col\n"
286
+ ") t\n"
287
+ "WHERE rnk = 1\n"
288
+ "ORDER BY metric DESC\n"
289
+ "\n"
290
+ "If the question asks for a global top (not per group), add LIMIT N "
291
+ "to the original query instead."
292
+ ),
293
+ })
294
+
295
  # ── Pattern 2a ───────────────────────────────────────────────────────────
296
  # Cumulative/running window applied directly to raw table rows without
297
  # pre-aggregating by date. SUM(...) OVER (ORDER BY date) on a raw scan