jashdoshi77 committed on
Commit
67611e2
·
1 Parent(s): 2650443

hopefully the final

Browse files
Files changed (2) hide show
  1. ai/signatures.py +38 -0
  2. ai/sql_pattern_checker.py +53 -0
ai/signatures.py CHANGED
@@ -160,6 +160,36 @@ class AnalyzeAndPlan(dspy.Signature):
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  ══════════════════════════════════════════════════════════════
164
  RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
165
  ══════════════════════════════════════════════════════════════
@@ -623,6 +653,14 @@ class SQLGeneration(dspy.Signature):
623
 
624
  4d. NO product_master table — never reference it; use product_id only.
625
 
 
 
 
 
 
 
 
 
626
  4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
627
  WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
628
  CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1
 
160
  There is no product_master, products, or product_catalog table.
161
  Use product_id as the only product identifier. Never invent table names.
162
 
163
+ ══════════════════════════════════════════════════════════════
164
+ RULE 1B2 — SELECT DISTINCT WHEN JOINING HEADER TO LINE TABLES
165
+ ══════════════════════════════════════════════════════════════
166
+ When selecting a header-level ID (so_id, po_id, customer_id) after
167
+ joining to line-level tables (sales_order_line, sales_order_line_pricing,
168
+ etc.), one header row can match MANY line rows.
169
+ Without DISTINCT, the same header ID appears once per matching line → duplicates.
170
+
171
+ WRONG (11,111 rows with duplicate so_ids):
172
+ SELECT so.so_id
173
+ FROM sales_table_v2_sales_order so
174
+ JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
175
+ JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
176
+ WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
177
+
178
+ CORRECT (8,079 unique orders):
179
+ SELECT DISTINCT so.so_id
180
+ FROM sales_table_v2_sales_order so
181
+ JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
182
+ JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
183
+ WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
184
+
185
+ Rule: if the SELECT list contains only header-level IDs/names (no aggregation,
186
+ no line-level columns) AND the query joins to line tables → always add DISTINCT.
187
+
188
+ ALSO: when comparing per-unit values against each other, do NOT multiply both
189
+ sides by quantity — it cancels out and adds noise.
190
+ REDUNDANT: making_charges_per_unit * quantity > diamond_amount_per_unit * quantity
191
+ SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit
192
+
193
  ══════════════════════════════════════════════════════════════
194
  RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
195
  ══════════════════════════════════════════════════════════════
 
653
 
654
  4d. NO product_master table — never reference it; use product_id only.
655
 
656
+ 4b2. SELECT header ID after joining line tables → always use SELECT DISTINCT:
657
+ Joining sales_order → sales_order_line → pricing produces one row per line item.
658
+ Without DISTINCT, same so_id appears N times (once per matching line) → duplicates.
659
+ WRONG: SELECT so.so_id FROM sales_order so JOIN sales_order_line sol ...
660
+ CORRECT: SELECT DISTINCT so.so_id FROM sales_order so JOIN sales_order_line sol ...
661
+ Also: comparing per-unit values against each other — never multiply both sides by
662
+ quantity (it cancels). Use: making_charges_per_unit > diamond_amount_per_unit
663
+
664
  4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
665
  WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
666
  CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1
ai/sql_pattern_checker.py CHANGED
@@ -232,6 +232,59 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
232
  ),
233
  })
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # ── Pattern 1b ───────────────────────────────────────────────────────────
236
  # "Top X per group" answered as a global sort instead of PARTITION BY ranking.
237
  #
 
232
  ),
233
  })
234
 
235
+ # ── Pattern 1a ───────────────────────────────────────────────────────────
236
+ # Missing DISTINCT when selecting a header ID after joining to line tables.
237
+ # One header (so_id / po_id) matches many line rows → same ID repeated per line.
238
+ # Detectable: SELECT has a header ID, JOINs include line tables, no DISTINCT,
239
+ # no aggregation (COUNT/SUM/AVG/etc.) in the SELECT list.
240
+ SALES_LINE_TABLES_SET = {
241
+ "sales_table_v2_sales_order_line",
242
+ "sales_table_v2_sales_order_line_pricing",
243
+ "sales_table_v2_sales_order_line_gold",
244
+ "sales_table_v2_sales_order_line_diamond",
245
+ "purchase_orders_v6_po_line_items",
246
+ "purchase_orders_v6_po_line_pricing",
247
+ "purchase_orders_v6_po_line_diamond",
248
+ "purchase_orders_v6_po_line_gold",
249
+ }
250
+ HEADER_IDS = {"so_id", "po_id", "sol_id", "pol_id"}
251
+
252
+ tables_referenced = {t.lower() for t in re.findall(r'\b(\w+)\b', sql_lower)}
253
+ joins_line_table = bool(SALES_LINE_TABLES_SET & tables_referenced)
254
+
255
+ if joins_line_table:
256
+ # Extract SELECT list (between SELECT and FROM)
257
+ select_match = re.search(r'\bselect\b(.*?)\bfrom\b', sql_lower, re.DOTALL)
258
+ if select_match:
259
+ select_list = select_match.group(1).strip()
260
+ has_distinct = select_list.startswith("distinct")
261
+ has_aggregation = bool(re.search(r'\b(sum|count|avg|min|max)\s*\(', select_list))
262
+ # Check if only header IDs (and maybe names) are selected
263
+ selected_cols = {c.strip().split('.')[-1].split(' ')[0]
264
+ for c in select_list.split(',')}
265
+ selects_only_header_id = bool(HEADER_IDS & selected_cols) and not has_aggregation
266
+
267
+ if selects_only_header_id and not has_distinct:
268
+ issues.append({
269
+ "pattern_name": "missing_distinct_header_id_with_line_join",
270
+ "description": (
271
+ "DUPLICATE ROWS β€” selecting a header ID (so_id/po_id) after joining "
272
+ "to line-level tables without DISTINCT. One order can have many line "
273
+ "items; without DISTINCT the same so_id appears once per matching "
274
+ "line, inflating row count (e.g. 11,111 rows instead of 8,079 orders)."
275
+ ),
276
+ "correction": (
277
+ "Add DISTINCT immediately after SELECT:\n"
278
+ " WRONG: SELECT so.so_id FROM ... JOIN sales_order_line ...\n"
279
+ " CORRECT: SELECT DISTINCT so.so_id FROM ... JOIN sales_order_line ...\n"
280
+ "\n"
281
+ "Also: when comparing per-unit columns against each other in WHERE, "
282
+ "do not multiply both sides by quantity β€” it cancels out:\n"
283
+ " REDUNDANT: making_charges_per_unit * quantity > diamond_amount_per_unit * quantity\n"
284
+ " SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit"
285
+ ),
286
+ })
287
+
288
  # ── Pattern 1b ───────────────────────────────────────────────────────────
289
  # "Top X per group" answered as a global sort instead of PARTITION BY ranking.
290
  #