Spaces:
Running
Running
Commit ·
67611e2
1
Parent(s): 2650443
hopefully the final
Browse files
- ai/signatures.py +38 -0
- ai/sql_pattern_checker.py +53 -0
ai/signatures.py
CHANGED
|
@@ -160,6 +160,36 @@ class AnalyzeAndPlan(dspy.Signature):
|
|
| 160 |
There is no product_master, products, or product_catalog table.
|
| 161 |
Use product_id as the only product identifier. Never invent table names.
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
──────────────────────────────────────────────────────────────
|
| 164 |
RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
|
| 165 |
──────────────────────────────────────────────────────────────
|
|
@@ -623,6 +653,14 @@ class SQLGeneration(dspy.Signature):
|
|
| 623 |
|
| 624 |
4d. NO product_master table — never reference it; use product_id only.
|
| 625 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
|
| 627 |
WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
|
| 628 |
CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1
|
|
|
|
| 160 |
There is no product_master, products, or product_catalog table.
|
| 161 |
Use product_id as the only product identifier. Never invent table names.
|
| 162 |
|
| 163 |
+
──────────────────────────────────────────────────────────────
|
| 164 |
+
RULE 1B2 — SELECT DISTINCT WHEN JOINING HEADER TO LINE TABLES
|
| 165 |
+
──────────────────────────────────────────────────────────────
|
| 166 |
+
When selecting a header-level ID (so_id, po_id, customer_id) after
|
| 167 |
+
joining to line-level tables (sales_order_line, sales_order_line_pricing,
|
| 168 |
+
etc.), one header row can match MANY line rows.
|
| 169 |
+
Without DISTINCT, the same header ID appears once per matching line → duplicates.
|
| 170 |
+
|
| 171 |
+
WRONG (11,111 rows with duplicate so_ids):
|
| 172 |
+
SELECT so.so_id
|
| 173 |
+
FROM sales_table_v2_sales_order so
|
| 174 |
+
JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
|
| 175 |
+
JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
|
| 176 |
+
WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
|
| 177 |
+
|
| 178 |
+
CORRECT (8,079 unique orders):
|
| 179 |
+
SELECT DISTINCT so.so_id
|
| 180 |
+
FROM sales_table_v2_sales_order so
|
| 181 |
+
JOIN sales_table_v2_sales_order_line sol ON so.so_id = sol.so_id
|
| 182 |
+
JOIN sales_table_v2_sales_order_line_pricing lp ON sol.sol_id = lp.sol_id
|
| 183 |
+
WHERE lp.making_charges_per_unit > lp.diamond_amount_per_unit
|
| 184 |
+
|
| 185 |
+
Rule: if the SELECT list contains only header-level IDs/names (no aggregation,
|
| 186 |
+
no line-level columns) AND the query joins to line tables → always add DISTINCT.
|
| 187 |
+
|
| 188 |
+
ALSO: when comparing per-unit values against each other, do NOT multiply both
|
| 189 |
+
sides by quantity — it cancels out and adds noise.
|
| 190 |
+
REDUNDANT: making_charges_per_unit * quantity > diamond_amount_per_unit * quantity
|
| 191 |
+
SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit
|
| 192 |
+
|
| 193 |
──────────────────────────────────────────────────────────────
|
| 194 |
RULE 1C0 — "TOP/BEST PER GROUP" REQUIRES ROW_NUMBER PARTITION BY
|
| 195 |
──────────────────────────────────────────────────────────────
|
|
|
|
| 653 |
|
| 654 |
4d. NO product_master table — never reference it; use product_id only.
|
| 655 |
|
| 656 |
+
4b2. SELECT header ID after joining line tables → always use SELECT DISTINCT:
|
| 657 |
+
Joining sales_order → sales_order_line → pricing produces one row per line item.
|
| 658 |
+
Without DISTINCT, same so_id appears N times (once per matching line) → duplicates.
|
| 659 |
+
WRONG: SELECT so.so_id FROM sales_order so JOIN sales_order_line sol ...
|
| 660 |
+
CORRECT: SELECT DISTINCT so.so_id FROM sales_order so JOIN sales_order_line sol ...
|
| 661 |
+
Also: comparing per-unit values against each other → never multiply both sides by
|
| 662 |
+
quantity (it cancels). Use: making_charges_per_unit > diamond_amount_per_unit
|
| 663 |
+
|
| 664 |
4c0. "TOP/BEST PER GROUP" → use ROW_NUMBER() PARTITION BY the group column, filter rnk = 1.
|
| 665 |
WRONG: GROUP BY city, customer ORDER BY revenue DESC (global sort, not per-city top)
|
| 666 |
CORRECT: ROW_NUMBER() OVER (PARTITION BY city ORDER BY revenue DESC) AS rnk … WHERE rnk = 1
|
ai/sql_pattern_checker.py
CHANGED
|
@@ -232,6 +232,59 @@ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
|
|
| 232 |
),
|
| 233 |
})
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# ── Pattern 1b ───────────────────────────────────────────────────────────
|
| 236 |
# "Top X per group" answered as a global sort instead of PARTITION BY ranking.
|
| 237 |
#
|
|
|
|
| 232 |
),
|
| 233 |
})
|
| 234 |
|
| 235 |
+
# ── Pattern 1a ───────────────────────────────────────────────────────────
|
| 236 |
+
# Missing DISTINCT when selecting a header ID after joining to line tables.
|
| 237 |
+
# One header (so_id / po_id) matches many line rows → same ID repeated per line.
|
| 238 |
+
# Detectable: SELECT has a header ID, JOINs include line tables, no DISTINCT,
|
| 239 |
+
# no aggregation (COUNT/SUM/AVG/etc.) in the SELECT list.
|
| 240 |
+
SALES_LINE_TABLES_SET = {
|
| 241 |
+
"sales_table_v2_sales_order_line",
|
| 242 |
+
"sales_table_v2_sales_order_line_pricing",
|
| 243 |
+
"sales_table_v2_sales_order_line_gold",
|
| 244 |
+
"sales_table_v2_sales_order_line_diamond",
|
| 245 |
+
"purchase_orders_v6_po_line_items",
|
| 246 |
+
"purchase_orders_v6_po_line_pricing",
|
| 247 |
+
"purchase_orders_v6_po_line_diamond",
|
| 248 |
+
"purchase_orders_v6_po_line_gold",
|
| 249 |
+
}
|
| 250 |
+
HEADER_IDS = {"so_id", "po_id", "sol_id", "pol_id"}
|
| 251 |
+
|
| 252 |
+
tables_referenced = {t.lower() for t in re.findall(r'\b(\w+)\b', sql_lower)}
|
| 253 |
+
joins_line_table = bool(SALES_LINE_TABLES_SET & tables_referenced)
|
| 254 |
+
|
| 255 |
+
if joins_line_table:
|
| 256 |
+
# Extract SELECT list (between SELECT and FROM)
|
| 257 |
+
select_match = re.search(r'\bselect\b(.*?)\bfrom\b', sql_lower, re.DOTALL)
|
| 258 |
+
if select_match:
|
| 259 |
+
select_list = select_match.group(1).strip()
|
| 260 |
+
has_distinct = select_list.startswith("distinct")
|
| 261 |
+
has_aggregation = bool(re.search(r'\b(sum|count|avg|min|max)\s*\(', select_list))
|
| 262 |
+
# Check if only header IDs (and maybe names) are selected
|
| 263 |
+
selected_cols = {c.strip().split('.')[-1].split(' ')[0]
|
| 264 |
+
for c in select_list.split(',')}
|
| 265 |
+
selects_only_header_id = bool(HEADER_IDS & selected_cols) and not has_aggregation
|
| 266 |
+
|
| 267 |
+
if selects_only_header_id and not has_distinct:
|
| 268 |
+
issues.append({
|
| 269 |
+
"pattern_name": "missing_distinct_header_id_with_line_join",
|
| 270 |
+
"description": (
|
| 271 |
+
"DUPLICATE ROWS — selecting a header ID (so_id/po_id) after joining "
|
| 272 |
+
"to line-level tables without DISTINCT. One order can have many line "
|
| 273 |
+
"items; without DISTINCT the same so_id appears once per matching "
|
| 274 |
+
"line, inflating row count (e.g. 11,111 rows instead of 8,079 orders)."
|
| 275 |
+
),
|
| 276 |
+
"correction": (
|
| 277 |
+
"Add DISTINCT immediately after SELECT:\n"
|
| 278 |
+
" WRONG: SELECT so.so_id FROM ... JOIN sales_order_line ...\n"
|
| 279 |
+
" CORRECT: SELECT DISTINCT so.so_id FROM ... JOIN sales_order_line ...\n"
|
| 280 |
+
"\n"
|
| 281 |
+
"Also: when comparing per-unit columns against each other in WHERE, "
|
| 282 |
+
"do not multiply both sides by quantity — it cancels out:\n"
|
| 283 |
+
" REDUNDANT: making_charges_per_unit * quantity > diamond_amount_per_unit * quantity\n"
|
| 284 |
+
" SIMPLIFIED: making_charges_per_unit > diamond_amount_per_unit"
|
| 285 |
+
),
|
| 286 |
+
})
|
| 287 |
+
|
| 288 |
# ── Pattern 1b ───────────────────────────────────────────────────────────
|
| 289 |
# "Top X per group" answered as a global sort instead of PARTITION BY ranking.
|
| 290 |
#
|