jashdoshi77 commited on
Commit
c36c8e5
Β·
1 Parent(s): 09b342a

fan out esting 2

Browse files
Files changed (3) hide show
  1. ai/pipeline.py +16 -0
  2. ai/signatures.py +97 -0
  3. ai/sql_pattern_checker.py +157 -0
ai/pipeline.py CHANGED
@@ -23,6 +23,7 @@ from ai.signatures import (
23
  InterpretAndInsight,
24
  )
25
  from ai.validator import validate_sql, check_sql_against_schema
 
26
  from db.schema import format_schema
27
  from db.relationships import format_relationships
28
  from db.profiler import get_data_profile
@@ -114,6 +115,21 @@ class SQLAnalystPipeline:
114
  )
115
  sql = self._clean_sql(sql_result.sql_query)
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # 4. Safety validation (no LLM call)
118
  is_safe, reason = validate_sql(sql)
119
  if not is_safe:
 
23
  InterpretAndInsight,
24
  )
25
  from ai.validator import validate_sql, check_sql_against_schema
26
+ from ai.sql_pattern_checker import check_sql_patterns, format_issues_for_repair
27
  from db.schema import format_schema
28
  from db.relationships import format_relationships
29
  from db.profiler import get_data_profile
 
115
  )
116
  sql = self._clean_sql(sql_result.sql_query)
117
 
118
+ # 3.5 Pattern check β€” detect known structural bad patterns, force targeted repair
119
+ pattern_issues = check_sql_patterns(sql)
120
+ if pattern_issues:
121
+ logger.warning(
122
+ "Pattern issues detected: %s",
123
+ [i["pattern_name"] for i in pattern_issues],
124
+ )
125
+ repair_instruction = format_issues_for_repair(pattern_issues)
126
+ sql_result = self.generate_sql(
127
+ question=question_with_date,
128
+ schema_info=schema_str,
129
+ query_plan=plan_text + "\n\n" + repair_instruction,
130
+ )
131
+ sql = self._clean_sql(sql_result.sql_query)
132
+
133
  # 4. Safety validation (no LLM call)
134
  is_safe, reason = validate_sql(sql)
135
  if not is_safe:
ai/signatures.py CHANGED
@@ -87,6 +87,79 @@ class AnalyzeAndPlan(dspy.Signature):
87
  β†’ For: "total amount of PO123", "PO value", "purchase order cost".
88
  β†’ NEVER sum gold_amount + diamond_amount from PO line tables β€” that misses labour.
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  ══════════════════════════════════════════════════════════════
91
  RULE 1A β€” FAN-OUT: DEDUPLICATE BEFORE AGGREGATING ON JOIN CHAINS
92
  ════════════════════════════════════════════════════��═════════
@@ -339,6 +412,30 @@ class SQLGeneration(dspy.Signature):
339
  Order-level: sales_table_v2_sales_order.total_amount
340
  Line-level: sales_table_v2_sales_order_line_pricing.line_total
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  5. USE PRE-COMPUTED TOTALS β€” NEVER RECONSTRUCT THEM:
343
  - For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
344
  - For PO totals: use purchase_orders_v6_purchase_order.total_amount
 
87
  β†’ For: "total amount of PO123", "PO value", "purchase order cost".
88
  β†’ NEVER sum gold_amount + diamond_amount from PO line tables β€” that misses labour.
89
 
90
+ ══════════════════════════════════════════════════════════════
91
+ RULE 1A β€” FAN-OUT: purchase_order + po_sales_order_link
92
+ ══════════════════════════════════════════════════════════════
93
+ po_sales_order_link has MULTIPLE rows per po_id (one per linked SO).
94
+ Joining purchase_order β†’ po_sales_order_link and doing SUM(total_amount)
95
+ counts each PO's amount once per linked SO β€” completely wrong.
96
+
97
+ WRONG (never write this):
98
+ SELECT po.vendor_id, SUM(po.total_amount)
99
+ FROM purchase_orders_v6_purchase_order po
100
+ JOIN purchase_orders_v6_po_sales_order_link pl ON po.po_id = pl.po_id
101
+ JOIN sales_table_v2_sales_order so ON pl.so_id = so.so_id
102
+ WHERE so.status = 'closed'
103
+ GROUP BY po.vendor_id
104
+
105
+ CORRECT (DISTINCT subquery first, then SUM):
106
+ SELECT vendor_id, SUM(total_amount) AS total_value
107
+ FROM (
108
+ SELECT DISTINCT po.po_id, po.vendor_id, po.total_amount
109
+ FROM purchase_orders_v6_purchase_order po
110
+ JOIN purchase_orders_v6_po_sales_order_link pl ON po.po_id = pl.po_id
111
+ JOIN sales_table_v2_sales_order so ON pl.so_id = so.so_id
112
+ WHERE so.status = 'closed'
113
+ ) deduped
114
+ GROUP BY vendor_id
115
+ ORDER BY total_value DESC
116
+
117
+ ══════════════════════════════════════════════════════════════
118
+ RULE 1B β€” LAG/LEAD must ORDER BY YEAR then MONTH
119
+ ══════════════════════════════════════════════════════════════
120
+ Data spans multiple years (2024–2026). ORDER BY month alone inside a window
121
+ function compares months across different years β€” wrong growth rates.
122
+
123
+ WRONG: LAG(revenue) OVER (ORDER BY EXTRACT(MONTH FROM order_date::date))
124
+ CORRECT: LAG(revenue) OVER (ORDER BY yr ASC, mo ASC)
125
+
126
+ Also: do NOT add WHERE status = 'closed' unless the question asks for it.
127
+ For growth/trend questions, use all orders (no status filter).
128
+
129
+ CORRECT MoM template:
130
+ WITH monthly AS (
131
+ SELECT EXTRACT(YEAR FROM order_date::date) AS yr,
132
+ EXTRACT(MONTH FROM order_date::date) AS mo,
133
+ SUM(total_amount) AS revenue
134
+ FROM sales_table_v2_sales_order
135
+ GROUP BY yr, mo
136
+ )
137
+ SELECT yr, mo, revenue,
138
+ LAG(revenue) OVER (ORDER BY yr ASC, mo ASC) AS prev_revenue
139
+ FROM monthly ORDER BY yr, mo
140
+
141
+ ══════════════════════════════════════════════════════════════
142
+ RULE 1C β€” IGI/NC certification is in variant_sku, NOT quality
143
+ ══════════════════════════════════════════════════════════════
144
+ The quality column in diamond tables holds grades like 'GH VVS' β€” never 'IGI'/'NC'.
145
+ Filtering quality = 'IGI' always returns zero rows.
146
+
147
+ WRONG: WHERE T3.quality IN ('IGI', 'Non-IGI')
148
+ CORRECT: WHERE variant_sku LIKE '%-IGI' (or '%-NC' for non-certified)
149
+
150
+ For "customers with both IGI and NC in same order":
151
+ WHERE so.so_id IN (
152
+ SELECT so_id FROM sales_table_v2_sales_order_line WHERE variant_sku LIKE '%-IGI'
153
+ INTERSECT
154
+ SELECT so_id FROM sales_table_v2_sales_order_line WHERE variant_sku LIKE '%-NC'
155
+ )
156
+
157
+ ══════════════════════════════════════════════════════════════
158
+ RULE 1D β€” NO product_master TABLE EXISTS
159
+ ══════════════════════════════════════════════════════════════
160
+ There is no product_master, products, or product_catalog table.
161
+ Use product_id as the only product identifier. Never invent table names.
162
+
163
  ══════════════════════════════════════════════════════════════
164
  RULE 1A β€” FAN-OUT: DEDUPLICATE BEFORE AGGREGATING ON JOIN CHAINS
165
  ════════════════════════════════════════════════════��═════════
 
412
  Order-level: sales_table_v2_sales_order.total_amount
413
  Line-level: sales_table_v2_sales_order_line_pricing.line_total
414
 
415
+ 4. FAN-OUT β€” when joining purchase_order to po_sales_order_link, use DISTINCT subquery:
416
+ WRONG: SELECT po.vendor_id, SUM(po.total_amount) FROM purchase_order po
417
+ JOIN po_sales_order_link pl ON po.po_id = pl.po_id ... GROUP BY po.vendor_id
418
+ CORRECT: SELECT vendor_id, SUM(total_amount) FROM (
419
+ SELECT DISTINCT po.po_id, po.vendor_id, po.total_amount
420
+ FROM purchase_orders_v6_purchase_order po
421
+ JOIN purchase_orders_v6_po_sales_order_link pl ON po.po_id = pl.po_id
422
+ JOIN sales_table_v2_sales_order so ON pl.so_id = so.so_id
423
+ WHERE so.status = 'closed'
424
+ ) deduped GROUP BY vendor_id
425
+
426
+ 4b. LAG/LEAD window functions β€” always ORDER BY yr ASC, mo ASC (never month only):
427
+ Also: do NOT add status = 'closed' for trend/growth queries unless explicitly asked.
428
+ WRONG: LAG(x) OVER (ORDER BY EXTRACT(MONTH FROM ...))
429
+ CORRECT: LAG(x) OVER (ORDER BY EXTRACT(YEAR FROM order_date::date) ASC,
430
+ EXTRACT(MONTH FROM order_date::date) ASC)
431
+
432
+ 4c. IGI/NC certification β€” from variant_sku LIKE '%-IGI' / '%-NC', NOT from quality column:
433
+ WRONG: WHERE quality IN ('IGI', 'Non-IGI')
434
+ CORRECT: WHERE variant_sku LIKE '%-IGI'
435
+ For both in same order: use INTERSECT on sales_order_line.
436
+
437
+ 4d. NO product_master table β€” never reference it; use product_id only.
438
+
439
  5. USE PRE-COMPUTED TOTALS β€” NEVER RECONSTRUCT THEM:
440
  - For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
441
  - For PO totals: use purchase_orders_v6_purchase_order.total_amount
ai/sql_pattern_checker.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Programmatic SQL pattern checker.
2
+
3
+ Detects known bad SQL patterns that LLMs generate incorrectly even when
4
+ instructed otherwise. Each detector returns a structured issue dict so the
5
+ pipeline can trigger a targeted LLM repair with a precise explanation.
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+
12
+ def check_sql_patterns(sql: str) -> list[dict[str, Any]]:
13
+ """Detect known bad patterns in a generated SQL string.
14
+
15
+ Returns a list of issue dicts:
16
+ {
17
+ "pattern_name": str, # short id
18
+ "description": str, # what is wrong and why
19
+ "correction": str, # exact fix to apply
20
+ }
21
+ An empty list means no issues found.
22
+ """
23
+ issues: list[dict[str, Any]] = []
24
+ sql_lower = sql.lower()
25
+
26
+ # ── Pattern 1 ────────────────────────────────────────────────────────────
27
+ # Fan-out: purchase_order JOINed to po_sales_order_link with SUM but no
28
+ # DISTINCT subquery β†’ total_amount counted once per linked SO.
29
+ if (
30
+ "po_sales_order_link" in sql_lower
31
+ and "purchase_order" in sql_lower
32
+ and re.search(r"\bsum\s*\(", sql_lower)
33
+ and "distinct" not in sql_lower
34
+ ):
35
+ issues.append({
36
+ "pattern_name": "fanout_po_link",
37
+ "description": (
38
+ "CRITICAL BUG β€” fan-out on po_sales_order_link: "
39
+ "purchase_orders_v6_po_sales_order_link has MULTIPLE rows per po_id "
40
+ "(one per linked sales order). Joining purchase_order to this table "
41
+ "and then doing SUM(total_amount) counts the same PO amount 2-3 times, "
42
+ "producing an inflated result (e.g. β‚Ή4,239 Cr instead of β‚Ή1,580 Cr)."
43
+ ),
44
+ "correction": (
45
+ "Wrap purchase_order in a DISTINCT subquery FIRST, then aggregate outside:\n"
46
+ "\n"
47
+ "CORRECT pattern:\n"
48
+ "SELECT vendor_id, SUM(total_amount) AS total_value\n"
49
+ "FROM (\n"
50
+ " SELECT DISTINCT po.po_id, po.vendor_id, po.total_amount\n"
51
+ " FROM purchase_orders_v6_purchase_order po\n"
52
+ " JOIN purchase_orders_v6_po_sales_order_link pl ON po.po_id = pl.po_id\n"
53
+ " JOIN sales_table_v2_sales_order so ON pl.so_id = so.so_id\n"
54
+ " WHERE so.status = 'closed'\n"
55
+ ") deduped\n"
56
+ "GROUP BY vendor_id\n"
57
+ "ORDER BY total_value DESC\n"
58
+ "\n"
59
+ "NEVER do: SUM(po.total_amount) directly after joining po_sales_order_link."
60
+ ),
61
+ })
62
+
63
+ # ── Pattern 2 ────────────────────────────────────────────────────────────
64
+ # LAG/LEAD window function with ORDER BY that includes month but not year.
65
+ # The data spans multiple years β€” ordering by month alone compares months
66
+ # across different years incorrectly.
67
+ if re.search(r"\blag\s*\(|\blead\s*\(", sql_lower):
68
+ # Find all OVER (...) clauses
69
+ over_blocks = re.findall(
70
+ r"over\s*\(([^)]*order\s+by[^)]*)\)", sql, re.IGNORECASE
71
+ )
72
+ for block in over_blocks:
73
+ block_lower = block.lower()
74
+ has_month = bool(re.search(r"\bmonth\b", block_lower))
75
+ has_year = bool(re.search(r"\byear\b", block_lower))
76
+ if has_month and not has_year:
77
+ issues.append({
78
+ "pattern_name": "lag_month_only_order",
79
+ "description": (
80
+ "CRITICAL BUG β€” LAG/LEAD window function orders by MONTH only. "
81
+ "The sales data spans multiple years (2024–2026). Ordering by month "
82
+ "alone makes the window function compare months across different years "
83
+ "(e.g. December 2024 followed by January 2024 instead of January 2025), "
84
+ "producing incorrect growth rates."
85
+ ),
86
+ "correction": (
87
+ "Always ORDER BY YEAR first, then MONTH inside window functions:\n"
88
+ "\n"
89
+ "CORRECT pattern:\n"
90
+ "WITH monthly AS (\n"
91
+ " SELECT\n"
92
+ " EXTRACT(YEAR FROM order_date::date) AS yr,\n"
93
+ " EXTRACT(MONTH FROM order_date::date) AS mo,\n"
94
+ " SUM(total_amount) AS revenue\n"
95
+ " FROM sales_table_v2_sales_order\n"
96
+ " GROUP BY yr, mo\n"
97
+ ")\n"
98
+ "SELECT yr, mo, revenue,\n"
99
+ " LAG(revenue) OVER (ORDER BY yr ASC, mo ASC) AS prev_revenue\n"
100
+ "FROM monthly\n"
101
+ "ORDER BY yr ASC, mo ASC\n"
102
+ "\n"
103
+ "Also GROUP BY yr, mo β€” never just mo."
104
+ ),
105
+ })
106
+ break # one report is enough
107
+
108
+ # ── Pattern 3 ────────────────────────────────────────────────────────────
109
+ # IGI/NC read from the 'quality' column of diamond tables.
110
+ # The quality column holds diamond grades (e.g. 'GH VVS'), never 'IGI'/'NC'.
111
+ # Certification is always in the last segment of variant_sku.
112
+ if re.search(r"\bquality\b", sql_lower):
113
+ # Check if IGI or NC appear as filter values near the quality column
114
+ if re.search(
115
+ r"quality\s*(=|in\s*\(|like)\s*['\"]?\s*(igi|nc|non.?igi|non.?certified)",
116
+ sql_lower,
117
+ ):
118
+ issues.append({
119
+ "pattern_name": "igi_nc_from_quality_column",
120
+ "description": (
121
+ "CRITICAL BUG β€” IGI/NC filtered from the quality column. "
122
+ "The quality column in diamond tables contains diamond grades "
123
+ "like 'GH VVS', 'EF VVS-VS' β€” the values 'IGI' and 'NC' do NOT "
124
+ "exist there, so this filter always returns zero rows."
125
+ ),
126
+ "correction": (
127
+ "Read certification from the LAST segment of variant_sku:\n"
128
+ " IGI certified β†’ variant_sku LIKE '%-IGI'\n"
129
+ " Non-certified β†’ variant_sku LIKE '%-NC'\n"
130
+ "\n"
131
+ "Apply on sales_table_v2_sales_order_line or sales_order_line_pricing.\n"
132
+ "\n"
133
+ "CORRECT pattern (customers with both IGI and NC in same order):\n"
134
+ "SELECT customer_id FROM sales_table_v2_sales_order so\n"
135
+ "WHERE so.so_id IN (\n"
136
+ " SELECT so_id FROM sales_table_v2_sales_order_line\n"
137
+ " WHERE variant_sku LIKE '%-IGI'\n"
138
+ " INTERSECT\n"
139
+ " SELECT so_id FROM sales_table_v2_sales_order_line\n"
140
+ " WHERE variant_sku LIKE '%-NC'\n"
141
+ ")"
142
+ ),
143
+ })
144
+
145
+ return issues
146
+
147
+
148
+ def format_issues_for_repair(issues: list[dict[str, Any]]) -> str:
149
+ """Format detected issues into a clear repair instruction for the LLM."""
150
+ lines = [
151
+ "YOUR GENERATED SQL HAS THE FOLLOWING CRITICAL BUGS β€” REWRITE TO FIX ALL OF THEM:\n"
152
+ ]
153
+ for i, issue in enumerate(issues, 1):
154
+ lines.append(f"BUG {i}: {issue['description']}")
155
+ lines.append(f"FIX {i}: {issue['correction']}")
156
+ lines.append("")
157
+ return "\n".join(lines)