Spaces:
Running
Running
Commit ·
c442c5a
1
Parent(s): c36c8e5
added column mismatch detector
Browse files- ai/signatures.py +48 -0
- ai/sql_pattern_checker.py +95 -0
ai/signatures.py
CHANGED
|
@@ -160,6 +160,43 @@ class AnalyzeAndPlan(dspy.Signature):
|
|
| 160 |
There is no product_master, products, or product_catalog table.
|
| 161 |
Use product_id as the only product identifier. Never invent table names.
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
RULE 1A — FAN-OUT: DEDUPLICATE BEFORE AGGREGATING ON JOIN CHAINS
|
| 165 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -436,6 +473,17 @@ class SQLGeneration(dspy.Signature):
|
|
| 436 |
|
| 437 |
4d. NO product_master table — never reference it; use product_id only.
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
5. USE PRE-COMPUTED TOTALS — NEVER RECONSTRUCT THEM:
|
| 440 |
- For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
|
| 441 |
- For PO totals: use purchase_orders_v6_purchase_order.total_amount
|
|
|
|
| 160 |
There is no product_master, products, or product_catalog table.
|
| 161 |
Use product_id as the only product identifier. Never invent table names.
|
| 162 |
|
| 163 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
RULE 1E — WHICH TABLE OWNS WHICH COLUMNS (DO NOT MIX)
|
| 165 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
sales_order_line_pricing — financial rollup only:
|
| 167 |
+
gold_amount_per_unit, diamond_amount_per_unit, making_charges_per_unit,
|
| 168 |
+
base_price_per_unit, selling_price_per_unit, line_total, final_amount,
|
| 169 |
+
quantity, sol_id, variant_sku, product_id
|
| 170 |
+
β Does NOT have: gold_kt, gold_colour, gold_rate_per_gm, metal_weight,
|
| 171 |
+
diamond_id, shape, quality, pointer, carats
|
| 172 |
+
|
| 173 |
+
sales_order_line_gold — physical gold attributes:
|
| 174 |
+
gold_kt, gold_colour, gold_rate_per_gm, metal_weight_per_unit,
|
| 175 |
+
finding_per_unit, gross_weight_per_unit, gold_amount_per_unit, sol_id
|
| 176 |
+
→ JOIN to pricing on sol_id when you need both gold attributes AND costs.
|
| 177 |
+
|
| 178 |
+
sales_order_line_diamond — physical diamond attributes:
|
| 179 |
+
diamond_id, shape, quality, size_mm, pointer, pieces_per_unit,
|
| 180 |
+
carats_per_unit, rate_per_carat, diamond_amount_per_unit, sol_id
|
| 181 |
+
→ JOIN to pricing on sol_id ONLY when the question asks about diamond
|
| 182 |
+
properties (shape, quality, pointer, carat) — NOT for cost aggregation.
|
| 183 |
+
|
| 184 |
+
RULE: If the question asks "by karat" / "by gold_kt" / "by colour" etc.,
|
| 185 |
+
you MUST join sales_order_line_gold. You cannot get gold_kt from pricing.
|
| 186 |
+
|
| 187 |
+
Example — total costs by karat type:
|
| 188 |
+
SELECT g.gold_kt,
|
| 189 |
+
SUM(lp.gold_amount_per_unit * lp.quantity) AS total_gold_amount,
|
| 190 |
+
SUM(lp.diamond_amount_per_unit * lp.quantity) AS total_diamond_amount,
|
| 191 |
+
SUM(lp.making_charges_per_unit * lp.quantity) AS total_making_charges
|
| 192 |
+
FROM sales_table_v2_sales_order_line_pricing lp
|
| 193 |
+
JOIN sales_table_v2_sales_order_line_gold g ON lp.sol_id = g.sol_id
|
| 194 |
+
JOIN sales_table_v2_sales_order_line sol ON lp.sol_id = sol.sol_id
|
| 195 |
+
JOIN sales_table_v2_sales_order so ON sol.so_id = so.so_id
|
| 196 |
+
WHERE so.status = 'closed'
|
| 197 |
+
GROUP BY g.gold_kt
|
| 198 |
+
ORDER BY g.gold_kt
|
| 199 |
+
|
| 200 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 201 |
RULE 1A — FAN-OUT: DEDUPLICATE BEFORE AGGREGATING ON JOIN CHAINS
|
| 202 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 473 |
|
| 474 |
4d. NO product_master table — never reference it; use product_id only.
|
| 475 |
|
| 476 |
+
4e. TABLE COLUMN OWNERSHIP — never use a column from the wrong table:
|
| 477 |
+
sales_order_line_pricing β has: gold_amount_per_unit, diamond_amount_per_unit,
|
| 478 |
+
making_charges_per_unit, line_total, quantity, sol_id, variant_sku, product_id
|
| 479 |
+
β does NOT have: gold_kt, gold_colour, shape, quality, diamond_id
|
| 480 |
+
sales_order_line_gold β has: gold_kt, gold_colour, gold_rate_per_gm,
|
| 481 |
+
metal_weight_per_unit (JOIN on sol_id when grouping/filtering by karat or colour)
|
| 482 |
+
sales_order_line_diamond β has: shape, quality, diamond_id, carats_per_unit
|
| 483 |
+
(JOIN on sol_id only for property filters, never for cost aggregation)
|
| 484 |
+
WRONG: SELECT lp.gold_kt ... FROM sales_order_line_pricing lp
|
| 485 |
+
CORRECT: JOIN sales_order_line_gold g ON lp.sol_id = g.sol_id, then use g.gold_kt
|
| 486 |
+
|
| 487 |
5. USE PRE-COMPUTED TOTALS — NEVER RECONSTRUCT THEM:
|
| 488 |
- For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
|
| 489 |
- For PO totals: use purchase_orders_v6_purchase_order.total_amount
|
ai/sql_pattern_checker.py
CHANGED
|
@@ -9,6 +9,96 @@ import re
|
|
| 9 |
from typing import Any
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
|
| 13 |
"""Detect known bad patterns in a generated SQL string.
|
| 14 |
|
|
@@ -142,6 +232,11 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
|
|
| 142 |
),
|
| 143 |
})
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
return issues
|
| 146 |
|
| 147 |
|
|
|
|
| 9 |
from typing import Any
|
| 10 |
|
| 11 |
|
| 12 |
+
# Keywords that can directly follow a table name. They must never be mistaken
# for that table's alias (e.g. "FROM orders WHERE ..." or "FROM a JOIN b").
_NON_ALIAS_KEYWORDS = (
    "ON", "USING", "WHERE", "GROUP", "ORDER", "HAVING", "LIMIT", "OFFSET",
    "FETCH", "WINDOW", "UNION", "INTERSECT", "EXCEPT", "JOIN", "INNER",
    "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "NATURAL",
)

# FROM/JOIN <table> [[AS] <alias>]
#   * the table name must not be followed by "." or "(" so that
#     "EXTRACT(YEAR FROM so.order_date)" and "FROM generate_series(...)" are
#     not treated as table references;
#   * the alias is optional and may not be one of _NON_ALIAS_KEYWORDS.
_TABLE_REF_RE = re.compile(
    r'\b(?:FROM|JOIN)\s+"?(\w+)\b"?(?!"?\s*[.(])'
    r'(?:\s+(?:AS\s+)?(?!(?:' + "|".join(_NON_ALIAS_KEYWORDS) + r')\b)"?(\w+)"?)?',
    re.IGNORECASE,
)

# alias.column, optionally written as "alias"."column".
_COLUMN_REF_RE = re.compile(r'\b(\w+)"?\."?(\w+)\b')

# Single-quoted string literals, "--" line comments and "/* */" block comments.
_LITERAL_OR_COMMENT_RE = re.compile(
    r"'(?:[^']|'')*'|--[^\n]*|/\*.*?\*/",
    re.DOTALL,
)


def _build_alias_map(sql: str) -> dict[str, str]:
    """Extract alias → full_table_name mapping from FROM / JOIN clauses.

    Handles:  FROM table_name
              FROM table_name alias
              FROM table_name AS alias
              JOIN table_name
              JOIN table_name alias
              JOIN table_name AS alias

    Every table is also mapped to itself so un-aliased ``table.column``
    references can be resolved. Returns lower-cased keys and values.
    """
    alias_map: dict[str, str] = {}
    for match in _TABLE_REF_RE.finditer(sql):
        table = match.group(1).lower()
        alias = match.group(2)  # None when the table is used without an alias
        if alias:
            alias_map[alias.lower()] = table
        # also map table → table in case no alias is used
        alias_map[table] = table
    return alias_map


def check_column_table_mismatches(
    sql: str,
    *,
    schema: "dict[str, list[dict[str, Any]]] | None" = None,
) -> list[dict[str, Any]]:
    """Schema-aware check: detect alias.column references where the column
    does not exist in the aliased table.

    Args:
        sql: The generated SQL statement to inspect.
        schema: Optional mapping of table name to a list of column dicts, each
            carrying a ``"column_name"`` key. When omitted, the schema is
            loaded via ``db.schema.get_schema()``.

    Returns:
        Issue dicts with ``pattern_name``, ``description`` and ``correction``
        keys, one per distinct offending ``alias.column`` pair. Returns an
        empty list when the schema cannot be loaded.
    """
    if schema is None:
        try:
            from db.schema import get_schema
            schema = get_schema()
        except Exception:
            # Best-effort check: without a schema there is nothing to compare
            # against, so skip — but leave a trace instead of failing silently.
            import logging
            logging.getLogger(__name__).debug(
                "schema unavailable; skipping column/table mismatch check",
                exc_info=True,
            )
            return []

    # Build {table_name_lower: {col_lower, ...}}
    table_cols: dict[str, set[str]] = {
        t.lower(): {c["column_name"].lower() for c in cols}
        for t, cols in schema.items()
    }

    # Dotted text inside string literals or comments is not a column
    # reference; blank it out before scanning.
    scannable = _LITERAL_OR_COMMENT_RE.sub(" ", sql)

    alias_map = _build_alias_map(scannable)
    issues: list[dict[str, Any]] = []
    seen: set[str] = set()

    # Find all alias.column references in the SQL
    for alias, col in _COLUMN_REF_RE.findall(scannable):
        alias_l = alias.lower()
        col_l = col.lower()
        key = f"{alias_l}.{col_l}"
        if key in seen:
            continue  # report each alias.column pair only once
        seen.add(key)

        table_l = alias_map.get(alias_l)
        if table_l is None:
            continue  # unknown alias (subquery alias, CTE name, etc.) — skip
        if table_l not in table_cols:
            continue  # table not in the schema — nothing to compare against
        if col_l in table_cols[table_l]:
            continue  # column really belongs to that table

        # Find which tables DO have this column
        tables_with_col = [
            t for t, cols in table_cols.items() if col_l in cols
        ]
        # Build a helpful correction hint
        if tables_with_col:
            hint = (
                f"Column '{col}' does NOT exist in '{table_l}'. "
                f"It is available in: {', '.join(tables_with_col)}. "
                f"JOIN the correct table on sol_id / so_id / po_id as appropriate "
                f"and reference that table's alias instead."
            )
        else:
            hint = (
                f"Column '{col}' does NOT exist in '{table_l}' "
                f"or any other table in the schema. "
                f"Remove it or use a column that actually exists."
            )
        issues.append({
            "pattern_name": f"wrong_table_for_{col_l}",
            "description": (
                f"CRITICAL BUG — column '{col}' referenced via alias '{alias}' "
                f"which maps to table '{table_l}', but that column does not exist there."
            ),
            "correction": hint,
        })

    return issues
|
| 100 |
+
|
| 101 |
+
|
| 102 |
def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
|
| 103 |
"""Detect known bad patterns in a generated SQL string.
|
| 104 |
|
|
|
|
| 232 |
),
|
| 233 |
})
|
| 234 |
|
| 235 |
+
# ββ Pattern 4 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 236 |
+
# Schema-aware: detect alias.column where column doesn't exist in that table.
|
| 237 |
+
# Generic β works for gold_kt on pricing table, or any future similar mistake.
|
| 238 |
+
issues.extend(check_column_table_mismatches(sql))
|
| 239 |
+
|
| 240 |
return issues
|
| 241 |
|
| 242 |
|