jashdoshi77 committed on
Commit
2f50e0a
·
1 Parent(s): 29bf2eb

window function testing

Browse files
Files changed (2) hide show
  1. ai/signatures.py +94 -0
  2. ai/sql_pattern_checker.py +74 -0
ai/signatures.py CHANGED
@@ -160,6 +160,86 @@ class AnalyzeAndPlan(dspy.Signature):
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  ═══════════════════���══════════════════════════════════════════
164
  RULE 1D0 β€” PERCENTAGE / RATIO WITH CASE WHEN β€” NEVER PRE-FILTER STATUS
165
  ══════════════════════════════════════════════════════════════
@@ -543,6 +623,20 @@ class SQLGeneration(dspy.Signature):
543
 
544
  4d. NO product_master table — never reference it; use product_id only.
545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  4d0. PERCENTAGE WITH CASE WHEN — never add WHERE status filter on the same column:
547
  When splitting by status with CASE WHEN, the denominator must include ALL rows.
548
  WRONG: WHERE status IN ('closed','cancelled') ... SUM(total_amount) as denominator
 
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
163
+ ══════════════════════════════════════════════════════════════
164
+ RULE 1C0 β€” "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
165
+ ══════════════════════════════════════════════════════════════
166
+ Questions like "top customer per city", "best product per category",
167
+ "highest revenue vendor per region" are PER-GROUP ranking problems.
168
+ A global ORDER BY + LIMIT returns the global top — NOT one per group.
169
+
170
+ WRONG (global sort — returns all rows or wrong subset):
171
+ SELECT city, customer_id, SUM(total_amount) AS rev
172
+ FROM ... GROUP BY city, customer_id
173
+ ORDER BY rev DESC ← sorts globally, does NOT pick one per city
174
+
175
+ CORRECT (ROW_NUMBER partitioned by the group column, filter rank = 1):
176
+ SELECT city, customer_id, customer_name, total_revenue
177
+ FROM (
178
+ SELECT cm.city, cm.customer_id, cm.customer_name,
179
+ SUM(so.total_amount) AS total_revenue,
180
+ ROW_NUMBER() OVER (PARTITION BY cm.city
181
+ ORDER BY SUM(so.total_amount) DESC) AS rnk
182
+ FROM sales_table_v2_sales_order so
183
+ JOIN sales_table_v2_customer_master cm ON so.customer_id = cm.customer_id
184
+ WHERE so.status = 'closed'
185
+ GROUP BY cm.city, cm.customer_id, cm.customer_name
186
+ ) t
187
+ WHERE rnk = 1
188
+ ORDER BY total_revenue DESC
189
+
190
+ Trigger words: "per city", "per region", "per category", "for each X … top/best/highest".
191
+
192
+ ══════════════════════════════════════════════════════════════
193
+ RULE 1C1 β€” "TOP N FOR BOTH X AND Y" REQUIRES TWO INDEPENDENT RANKs
194
+ ══════════════════════════════════════════════════════════════
195
+ "Top 5 by revenue AND top 5 by diamond cost" means a product must be in
196
+ the top 5 on EACH metric independently.
197
+ ORDER BY revenue DESC, cost DESC LIMIT 5 is NOT two rankings — it ranks
198
+ by revenue and uses cost only as a tiebreaker, returning the wrong result.
199
+
200
+ WRONG:
201
+ ORDER BY revenue DESC, diamond_cost DESC LIMIT 5 ← not two rankings
202
+
203
+ CORRECT (two independent RANK() window functions, filter where both <= N):
204
+ SELECT product_id, revenue, diamond_cost, rev_rank, diamond_rank
205
+ FROM (
206
+ SELECT lp.product_id,
207
+ SUM(lp.line_total) AS revenue,
208
+ SUM(lp.diamond_amount_per_unit * lp.quantity) AS diamond_cost,
209
+ RANK() OVER (ORDER BY SUM(lp.line_total) DESC) AS rev_rank,
210
+ RANK() OVER (ORDER BY SUM(lp.diamond_amount_per_unit * lp.quantity) DESC)
211
+ AS diamond_rank
212
+ FROM sales_table_v2_sales_order_line_pricing lp
213
+ JOIN sales_table_v2_sales_order_line sol ON lp.sol_id = sol.sol_id
214
+ JOIN sales_table_v2_sales_order so ON sol.so_id = so.so_id
215
+ WHERE so.status = 'closed'
216
+ GROUP BY lp.product_id
217
+ ) t
218
+ WHERE rev_rank <= 5 AND diamond_rank <= 5
219
+
220
+ ══════════════════════════════════════════════════════════════
221
+ RULE 1C2 β€” CUMULATIVE/RUNNING WINDOW NEEDS PRE-AGGREGATION
222
+ ══════════════════════════════════════════════════════════════
223
+ Applying SUM(...) OVER (ORDER BY date) directly on raw order rows produces
224
+ one cumulative row per ORDER (not per date). Multiple orders on the same
225
+ date get separate cumulative values — wrong.
226
+ Always GROUP BY date first in a subquery, then apply the window on top.
227
+
228
+ WRONG (window over raw rows — one row per order, same date repeats):
229
+ SELECT order_date, SUM(total_amount) OVER (ORDER BY order_date) AS cum_rev
230
+ FROM sales_table_v2_sales_order WHERE status = 'closed'
231
+
232
+ CORRECT (aggregate by date first, then window):
233
+ SELECT order_date, daily_revenue,
234
+ SUM(daily_revenue) OVER (ORDER BY order_date) AS cumulative_revenue
235
+ FROM (
236
+ SELECT order_date::date AS order_date, SUM(total_amount) AS daily_revenue
237
+ FROM sales_table_v2_sales_order
238
+ WHERE status = 'closed'
239
+ GROUP BY order_date::date
240
+ ) t
241
+ ORDER BY order_date
242
+
243
  ═══════════════════���══════════════════════════════════════════
244
  RULE 1D0 β€” PERCENTAGE / RATIO WITH CASE WHEN β€” NEVER PRE-FILTER STATUS
245
  ══════════════════════════════════════════════════════════════
 
623
 
624
  4d. NO product_master table — never reference it; use product_id only.
625
 
626
+ 4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
627
+ WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
628
+ CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1
629
+
630
+ 4c1. "TOP N FOR BOTH X AND Y" → two independent RANK() window functions, filter both <= N.
631
+ WRONG: ORDER BY revenue DESC, cost DESC LIMIT 5 (cost is just tiebreaker, not ranked)
632
+ CORRECT: RANK() OVER (ORDER BY revenue DESC) AS rev_rank,
633
+ RANK() OVER (ORDER BY cost DESC) AS cost_rank … WHERE rev_rank<=5 AND cost_rank<=5
634
+
635
+ 4c2. CUMULATIVE/RUNNING WINDOW → always GROUP BY date first in a subquery, then apply window.
636
+ WRONG: SUM(total_amount) OVER (ORDER BY order_date) FROM sales_order (per-row window)
637
+ CORRECT: SUM(daily_revenue) OVER (ORDER BY order_date) FROM (SELECT order_date::date,
638
+ SUM(total_amount) AS daily_revenue FROM ... GROUP BY order_date::date) t
639
+
640
  4d0. PERCENTAGE WITH CASE WHEN — never add WHERE status filter on the same column:
641
  When splitting by status with CASE WHEN, the denominator must include ALL rows.
642
  WRONG: WHERE status IN ('closed','cancelled') ... SUM(total_amount) as denominator
ai/sql_pattern_checker.py CHANGED
@@ -232,6 +232,80 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
232
  ),
233
  })
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # ── Pattern 3a ───────────────────────────────────────────────────────────
236
  # WHERE status filter alongside CASE WHEN status β€” wrong denominator.
237
  # When computing "percentage of X vs Y", the WHERE clause must NOT pre-filter
 
232
  ),
233
  })
234
 
235
+ # ── Pattern 2a ───────────────────────────────────────────────────────────
236
+ # Cumulative/running window applied directly to raw table rows without
237
+ # pre-aggregating by date. SUM(...) OVER (ORDER BY date) on a raw scan
238
+ # produces one row per ORDER, not one per date.
239
+ # Detectable: OVER (ORDER BY ...) present + no subquery/CTE with GROUP BY.
240
+ if re.search(r"\bover\s*\(.*?order\s+by\b", sql_lower, re.DOTALL):
241
+ has_window = bool(re.search(r"\bsum\s*\([^)]+\)\s+over\s*\(", sql_lower))
242
+ # Count how many times SELECT appears β€” more than one means a subquery exists
243
+ select_count = len(re.findall(r"\bselect\b", sql_lower))
244
+ # GROUP BY anywhere in the SQL (covers both CTE and inline subquery patterns)
245
+ has_any_group_by = bool(re.search(r"\bgroup\s+by\b", sql_lower))
246
+ # If there's a subquery (multiple SELECTs) with GROUP BY, treat it as pre-aggregated
247
+ has_pre_aggregation = has_any_group_by and select_count > 1
248
+ if has_window and not has_pre_aggregation:
249
+ issues.append({
250
+ "pattern_name": "cumulative_window_without_pre_aggregation",
251
+ "description": (
252
+ "WRONG RESULT β€” SUM(...) OVER (ORDER BY date) applied directly to raw rows. "
253
+ "With multiple orders per date, the window produces one cumulative value "
254
+ "per ORDER ROW, not per date β€” same date appears multiple times with "
255
+ "different cumulative totals. The correct approach is to GROUP BY date "
256
+ "first in a subquery, then apply the cumulative window on top."
257
+ ),
258
+ "correction": (
259
+ "Aggregate by date first, then apply the window:\n"
260
+ "\n"
261
+ "CORRECT:\n"
262
+ "SELECT order_date, daily_revenue,\n"
263
+ " SUM(daily_revenue) OVER (ORDER BY order_date) AS cumulative_revenue\n"
264
+ "FROM (\n"
265
+ " SELECT order_date::date AS order_date,\n"
266
+ " SUM(total_amount) AS daily_revenue\n"
267
+ " FROM sales_table_v2_sales_order\n"
268
+ " WHERE status = 'closed'\n"
269
+ " GROUP BY order_date::date\n"
270
+ ") t\n"
271
+ "ORDER BY order_date"
272
+ ),
273
+ })
274
+
275
+ # ── Pattern 2b ───────────────────────────────────────────────────────────
276
+ # "Top N for BOTH metric A and metric B" β€” using ORDER BY a, b LIMIT N
277
+ # ranks by a (b is just tiebreaker). Needs two independent RANK() windows.
278
+ # Detectable: ORDER BY has two or more columns AND LIMIT present AND no RANK/ROW_NUMBER.
279
+ if (
280
+ re.search(r"\border\s+by\b[^;]+,", sql_lower) # ORDER BY with multiple cols
281
+ and re.search(r"\blimit\s+\d+", sql_lower)
282
+ and not re.search(r"\b(?:rank|row_number|dense_rank)\s*\(", sql_lower)
283
+ and re.search(r"\bsum\s*\(", sql_lower) # aggregation present
284
+ ):
285
+ issues.append({
286
+ "pattern_name": "dual_metric_limit_not_dual_rank",
287
+ "description": (
288
+ "POSSIBLE BUG β€” ORDER BY metricA, metricB LIMIT N is NOT two independent "
289
+ "rankings. metricB is only a tiebreaker; the LIMIT picks top-N by metricA. "
290
+ "If the question asks for items that rank in the top N for BOTH metrics "
291
+ "independently, you must use two separate RANK() window functions."
292
+ ),
293
+ "correction": (
294
+ "Use two independent RANK() windows and filter where both ranks <= N:\n"
295
+ "\n"
296
+ "SELECT * FROM (\n"
297
+ " SELECT product_id,\n"
298
+ " SUM(metric_a) AS metric_a,\n"
299
+ " SUM(metric_b) AS metric_b,\n"
300
+ " RANK() OVER (ORDER BY SUM(metric_a) DESC) AS rank_a,\n"
301
+ " RANK() OVER (ORDER BY SUM(metric_b) DESC) AS rank_b\n"
302
+ " FROM ...\n"
303
+ " GROUP BY product_id\n"
304
+ ") t\n"
305
+ "WHERE rank_a <= N AND rank_b <= N"
306
+ ),
307
+ })
308
+
309
  # ── Pattern 3a ───────────────────────────────────────────────────────────
310
  # WHERE status filter alongside CASE WHEN status β€” wrong denominator.
311
  # When computing "percentage of X vs Y", the WHERE clause must NOT pre-filter