jashdoshi77 committed on
Commit
057135b
·
1 Parent(s): 4791304

sql 'per' pattern checker

Browse files
Files changed (2) hide show
  1. ai/signatures.py +31 -0
  2. ai/sql_pattern_checker.py +25 -0
ai/signatures.py CHANGED
@@ -160,6 +160,29 @@ class AnalyzeAndPlan(dspy.Signature):
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  ══════════════════════════════════════════════════════════════
164
  RULE 1E — WHICH TABLE OWNS WHICH COLUMNS (DO NOT MIX)
165
  ══════════════════════════════════════════════════════════════
@@ -482,6 +505,14 @@ class SQLGeneration(dspy.Signature):
482
 
483
  4d. NO product_master table — never reference it; use product_id only.
484
 
 
 
 
 
 
 
 
 
485
  4e. TABLE COLUMN OWNERSHIP — never use a column from the wrong table:
486
  sales_order_line_pricing → has: gold_amount_per_unit, diamond_amount_per_unit,
487
  making_charges_per_unit, line_total, quantity, sol_id, variant_sku, product_id
 
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
163
+ ══════════════════════════════════════════════════════════════
164
+ RULE 1D2 — "PER X" DENOMINATOR — READ THE QUESTION CAREFULLY
165
+ ══════════════════════════════════════════════════════════════
166
+ The word after "per" tells you exactly what the denominator must be.
167
+ Using the wrong denominator gives a completely different metric.
168
+
169
+ "per order" → COUNT(DISTINCT so_id) ← number of sales orders
170
+ "per unit" → SUM(quantity) ← number of pieces/items sold
171
+ "per customer" → COUNT(DISTINCT customer_id)
172
+ "per product" → COUNT(DISTINCT product_id)
173
+ "per vendor" → COUNT(DISTINCT vendor_id)
174
+ "per SKU" → COUNT(DISTINCT variant_sku)
175
+
176
+ WRONG — "per order" using quantity as denominator:
177
+ SUM(lp.line_total) / SUM(sol.quantity) ← this is revenue per UNIT, not per ORDER
178
+
179
+ CORRECT — "per order" using distinct order count:
180
+ SUM(lp.line_total) / COUNT(DISTINCT so.so_id) ← this is revenue per ORDER
181
+
182
+ Similarly for AOV (average order value):
183
+ AVG(total_amount) or SUM(total_amount) / COUNT(DISTINCT so_id)
184
+ NEVER SUM(total_amount) / SUM(quantity)
185
+
186
  ══════════════════════════════════════════════════════════════
187
  RULE 1E — WHICH TABLE OWNS WHICH COLUMNS (DO NOT MIX)
188
  ══════════════════════════════════════════════════════════════
 
505
 
506
  4d. NO product_master table — never reference it; use product_id only.
507
 
508
+ 4d2. "PER X" DENOMINATOR — use the correct divisor for what "per" refers to:
509
+ "per order" → COUNT(DISTINCT so.so_id) NOT SUM(quantity)
510
+ "per unit" → SUM(quantity) NOT COUNT(DISTINCT so_id)
511
+ "per customer" → COUNT(DISTINCT so.customer_id)
512
+ "per vendor" → COUNT(DISTINCT vendor_id)
513
+ WRONG: SUM(line_total) / SUM(quantity) ← revenue per unit, not per order
514
+ CORRECT: SUM(line_total) / COUNT(DISTINCT so.so_id) ← revenue per order
515
+
516
  4e. TABLE COLUMN OWNERSHIP — never use a column from the wrong table:
517
  sales_order_line_pricing → has: gold_amount_per_unit, diamond_amount_per_unit,
518
  making_charges_per_unit, line_total, quantity, sol_id, variant_sku, product_id
ai/sql_pattern_checker.py CHANGED
@@ -232,6 +232,31 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
232
  ),
233
  })
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # ── Pattern 4 ────────────────────────────────────────────────────────────
236
  # Schema-aware: detect alias.column where column doesn't exist in that table.
237
  # Generic — works for gold_kt on pricing table, or any future similar mistake.
 
232
  ),
233
  })
234
 
235
+ # ── Pattern 3b ───────────────────────────────────────────────────────────
236
+ # "per order" metric computed with SUM(quantity) as denominator instead of
237
+ # COUNT(DISTINCT so_id). SUM(quantity) = revenue per unit; "per order"
238
+ # requires COUNT(DISTINCT so_id).
239
+ # Heuristic: division where the denominator contains sum(...quantity...)
240
+ if re.search(r"/\s*sum\s*\([^)]*quantit", sql_lower):
241
+ issues.append({
242
+ "pattern_name": "per_unit_instead_of_per_order",
243
+ "description": (
244
+ "POSSIBLE BUG — dividing by SUM(quantity) gives revenue per UNIT (per piece). "
245
+ "If the question asks for 'per order', the denominator must be "
246
+ "COUNT(DISTINCT so_id), not SUM(quantity). "
247
+ "These are completely different metrics: "
248
+ "SUM(line_total)/SUM(quantity) = avg revenue per item sold; "
249
+ "SUM(line_total)/COUNT(DISTINCT so_id) = avg revenue each time product appears in an order."
250
+ ),
251
+ "correction": (
252
+ "Check the question: does it say 'per order' or 'per unit/piece'?\n"
253
+ " 'per order' → SUM(lp.line_total) / COUNT(DISTINCT so.so_id)\n"
254
+ " 'per unit' → SUM(lp.line_total) / SUM(lp.quantity)\n"
255
+ " 'per customer' → SUM(lp.line_total) / COUNT(DISTINCT so.customer_id)\n"
256
+ "If the question says 'per order', rewrite using COUNT(DISTINCT so.so_id)."
257
+ ),
258
+ })
259
+
260
  # ── Pattern 4 ────────────────────────────────────────────────────────────
261
  # Schema-aware: detect alias.column where column doesn't exist in that table.
262
  # Generic — works for gold_kt on pricing table, or any future similar mistake.