Spaces:

jashdoshi77
/

sqlbot

Running

App Files Community

jashdoshi77 committed 3 days ago

Commit

67611e2

1 Parent(s): 2650443

hopefully the final

Browse files

Files changed (2) hide show

ai/signatures.py +38 -0
ai/sql_pattern_checker.py +53 -0

ai/signatures.py CHANGED Viewed

@@ -160,6 +160,36 @@ class AnalyzeAndPlan(dspy.Signature):
     There is no product_master, products, or product_catalog table.
     Use product_id as the only product identifier. Never invent table names.
     ══════════════════════════════════════════════════════════════
     RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
     ══════════════════════════════════════════════════════════════
@@ -623,6 +653,14 @@ class SQLGeneration(dspy.Signature):
     4d. NO product_master table — never reference it; use product_id only.
     4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
          WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
          CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1

     There is no product_master, products, or product_catalog table.
     Use product_id as the only product identifier. Never invent table names.
+    ══════════════════════════════════════════════════════════════
+    RULE 1B2 — SELECT DISTINCT WHEN JOINING HEADER TO LINE TABLES
+    ══════════════════════════════════════════════════════════════
+    When selecting a header-level ID (so_id, po_id, customer_id) after
+    joining to line-level tables (sales_order_line, sales_order_line_pricing,
+    etc.), one header row can match MANY line rows.
+    Without DISTINCT, the same header ID appears once per matching line → duplicates.
+    WRONG (11,111 rows with duplicate so_ids):
+      SELECT so.so_id
+      FROM sales_table_v2_sales_order so
+      JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
+      JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
+      WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
+    CORRECT (8,079 unique orders):
+      SELECT DISTINCT so.so_id
+      FROM sales_table_v2_sales_order so
+      JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
+      JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
+      WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
+    Rule: if the SELECT list contains only header-level IDs/names (no aggregation,
+    no line-level columns) AND the query joins to line tables → always add DISTINCT.
+    ALSO: when comparing per-unit values against each other, do NOT multiply both
+    sides by quantity — it cancels out and adds noise.
+      REDUNDANT:  making_charges_per_unit * quantity > diamond_amount_per_unit * quantity
+      SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit
     ══════════════════════════════════════════════════════════════
     RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
     ══════════════════════════════════════════════════════════════
     4d. NO product_master table — never reference it; use product_id only.
+    4b2. SELECT header ID after joining line tables → always use SELECT DISTINCT:
+         Joining sales_order → sales_order_line → pricing produces one row per line item.
+         Without DISTINCT, same so_id appears N times (once per matching line) → duplicates.
+         WRONG:   SELECT so.so_id FROM sales_order so JOIN sales_order_line sol ...
+         CORRECT: SELECT DISTINCT so.so_id FROM sales_order so JOIN sales_order_line sol ...
+         Also: comparing per-unit values against each other — never multiply both sides by
+         quantity (it cancels). Use: making_charges_per_unit > diamond_amount_per_unit
     4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
          WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
          CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1

ai/sql_pattern_checker.py CHANGED Viewed

@@ -232,6 +232,59 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
                 ),
             })
     # ── Pattern 1b ───────────────────────────────────────────────────────────
     # "Top X per group" answered as a global sort instead of PARTITION BY ranking.
     #

                 ),
             })
+    # ── Pattern 1a ───────────────────────────────────────────────────────────
+    # Missing DISTINCT when selecting a header ID after joining to line tables.
+    # One header (so_id / po_id) matches many line rows → same ID repeated per line.
+    # Detectable: SELECT has a header ID, JOINs include line tables, no DISTINCT,
+    # no aggregation (COUNT/SUM/AVG/etc.) in the SELECT list.
+    SALES_LINE_TABLES_SET = {
+        "sales_table_v2_sales_order_line",
+        "sales_table_v2_sales_order_line_pricing",
+        "sales_table_v2_sales_order_line_gold",
+        "sales_table_v2_sales_order_line_diamond",
+        "purchase_orders_v6_po_line_items",
+        "purchase_orders_v6_po_line_pricing",
+        "purchase_orders_v6_po_line_diamond",
+        "purchase_orders_v6_po_line_gold",
+    }
+    HEADER_IDS = {"so_id", "po_id", "sol_id", "pol_id"}
+    tables_referenced = {t.lower() for t in re.findall(r'\b(\w+)\b', sql_lower)}
+    joins_line_table = bool(SALES_LINE_TABLES_SET & tables_referenced)
+    if joins_line_table:
+        # Extract SELECT list (between SELECT and FROM)
+        select_match = re.search(r'\bselect\b(.*?)\bfrom\b', sql_lower, re.DOTALL)
+        if select_match:
+            select_list = select_match.group(1).strip()
+            has_distinct = select_list.startswith("distinct")
+            has_aggregation = bool(re.search(r'\b(sum|count|avg|min|max)\s*\(', select_list))
+            # Check whether at least one selected column is a header ID (other columns may also be selected)
+            selected_cols = {c.strip().split('.')[-1].split(' ')[0]
+                             for c in select_list.split(',')}
+            selects_only_header_id = bool(HEADER_IDS & selected_cols) and not has_aggregation
+            if selects_only_header_id and not has_distinct:
+                issues.append({
+                    "pattern_name": "missing_distinct_header_id_with_line_join",
+                    "description": (
+                        "DUPLICATE ROWS — selecting a header ID (so_id/po_id) after joining "
+                        "to line-level tables without DISTINCT. One order can have many line "
+                        "items; without DISTINCT the same so_id appears once per matching "
+                        "line, inflating row count (e.g. 11,111 rows instead of 8,079 orders)."
+                    ),
+                    "correction": (
+                        "Add DISTINCT immediately after SELECT:\n"
+                        "  WRONG:   SELECT so.so_id FROM ... JOIN sales_order_line ...\n"
+                        "  CORRECT: SELECT DISTINCT so.so_id FROM ... JOIN sales_order_line ...\n"
+                        "\n"
+                        "Also: when comparing per-unit columns against each other in WHERE, "
+                        "do not multiply both sides by quantity — it cancels out:\n"
+                        "  REDUNDANT:  making_charges_per_unit * quantity > diamond_amount_per_unit * quantity\n"
+                        "  SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit"
+                    ),
+                })
     # ── Pattern 1b ───────────────────────────────────────────────────────────
     # "Top X per group" answered as a global sort instead of PARTITION BY ranking.
     #