jashdoshi77 committed on
Commit
4aa0ed6
·
1 Parent(s): a62955e

fix: inject date context and add PO vs SO disambiguation rules

Browse files
Files changed (2) hide show
  1. ai/pipeline.py +26 -8
  2. ai/signatures.py +57 -14
ai/pipeline.py CHANGED
@@ -10,6 +10,7 @@ Reduced from 9 stages to 4 LLM calls in the happy path:
10
  import json
11
  import logging
12
  import re
 
13
  from typing import Any
14
 
15
  import dspy
@@ -48,16 +49,32 @@ class SQLAnalystPipeline:
48
 
49
  # ── public API ──────────────────────────────────────────────────────
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def run(self, question: str) -> dict[str, Any]:
52
  """Run the full pipeline and return {sql, data, answer, insights}."""
53
  schema_str = format_schema()
54
  rels_str = format_relationships()
55
  profile_str = get_data_profile()
56
 
 
 
57
  # 1. Analyze & Plan (single LLM call replaces 3 former stages)
58
  logger.info("Stage 1 β€” Analyze & Plan")
59
  plan = self.analyze(
60
- question=question,
61
  schema_info=schema_str,
62
  relationships=rels_str,
63
  data_profile=profile_str,
@@ -78,7 +95,7 @@ class SQLAnalystPipeline:
78
  # 2. SQL Generation
79
  logger.info("Stage 2 β€” SQL Generation")
80
  sql_result = self.generate_sql(
81
- question=question,
82
  schema_info=schema_str,
83
  query_plan=plan_text,
84
  )
@@ -90,9 +107,8 @@ class SQLAnalystPipeline:
90
  schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
91
  if not schema_valid:
92
  logger.warning(f"Schema issues detected: {schema_issues}")
93
- # Try regenerating SQL once with the issues as feedback
94
  sql_result = self.generate_sql(
95
- question=question,
96
  schema_info=schema_str,
97
  query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
98
  )
@@ -120,7 +136,7 @@ class SQLAnalystPipeline:
120
  sql_query=sql,
121
  error_message=exec_result["error"],
122
  schema_info=schema_str,
123
- question=question,
124
  )
125
  sql = self._clean_sql(repair_result.corrected_sql)
126
  is_safe, reason = validate_sql(sql)
@@ -166,8 +182,10 @@ class SQLAnalystPipeline:
166
  rels_str = format_relationships()
167
  profile_str = get_data_profile()
168
 
 
 
169
  plan = self.analyze(
170
- question=question,
171
  schema_info=schema_str,
172
  relationships=rels_str,
173
  data_profile=profile_str,
@@ -186,7 +204,7 @@ class SQLAnalystPipeline:
186
  )
187
 
188
  sql_result = self.generate_sql(
189
- question=question,
190
  schema_info=schema_str,
191
  query_plan=plan_text,
192
  )
@@ -197,7 +215,7 @@ class SQLAnalystPipeline:
197
  schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
198
  if not schema_valid:
199
  sql_result = self.generate_sql(
200
- question=question,
201
  schema_info=schema_str,
202
  query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
203
  )
 
10
  import json
11
  import logging
12
  import re
13
+ from datetime import date
14
  from typing import Any
15
 
16
  import dspy
 
49
 
50
  # ── public API ──────────────────────────────────────────────────────
51
 
52
+ @staticmethod
53
+ def _build_question_with_context(question: str) -> str:
54
+ """Prepend today's date so the model can resolve relative time references."""
55
+ today = date.today()
56
+ current_year = today.year
57
+ last_year = current_year - 1
58
+ return (
59
+ f"[CONTEXT: Today is {today.isoformat()}. "
60
+ f"Current year = {current_year}. "
61
+ f"'Last year' = {last_year} ({last_year}-01-01 to {last_year}-12-31). "
62
+ f"'This year' = {current_year} ({current_year}-01-01 to {current_year}-12-31).]\n\n"
63
+ f"{question}"
64
+ )
65
+
66
  def run(self, question: str) -> dict[str, Any]:
67
  """Run the full pipeline and return {sql, data, answer, insights}."""
68
  schema_str = format_schema()
69
  rels_str = format_relationships()
70
  profile_str = get_data_profile()
71
 
72
+ question_with_date = self._build_question_with_context(question)
73
+
74
  # 1. Analyze & Plan (single LLM call replaces 3 former stages)
75
  logger.info("Stage 1 β€” Analyze & Plan")
76
  plan = self.analyze(
77
+ question=question_with_date,
78
  schema_info=schema_str,
79
  relationships=rels_str,
80
  data_profile=profile_str,
 
95
  # 2. SQL Generation
96
  logger.info("Stage 2 β€” SQL Generation")
97
  sql_result = self.generate_sql(
98
+ question=question_with_date,
99
  schema_info=schema_str,
100
  query_plan=plan_text,
101
  )
 
107
  schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
108
  if not schema_valid:
109
  logger.warning(f"Schema issues detected: {schema_issues}")
 
110
  sql_result = self.generate_sql(
111
+ question=question_with_date,
112
  schema_info=schema_str,
113
  query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
114
  )
 
136
  sql_query=sql,
137
  error_message=exec_result["error"],
138
  schema_info=schema_str,
139
+ question=question_with_date,
140
  )
141
  sql = self._clean_sql(repair_result.corrected_sql)
142
  is_safe, reason = validate_sql(sql)
 
182
  rels_str = format_relationships()
183
  profile_str = get_data_profile()
184
 
185
+ question_with_date = self._build_question_with_context(question)
186
+
187
  plan = self.analyze(
188
+ question=question_with_date,
189
  schema_info=schema_str,
190
  relationships=rels_str,
191
  data_profile=profile_str,
 
204
  )
205
 
206
  sql_result = self.generate_sql(
207
+ question=question_with_date,
208
  schema_info=schema_str,
209
  query_plan=plan_text,
210
  )
 
215
  schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
216
  if not schema_valid:
217
  sql_result = self.generate_sql(
218
+ question=question_with_date,
219
  schema_info=schema_str,
220
  query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
221
  )
ai/signatures.py CHANGED
@@ -68,21 +68,53 @@ class AnalyzeAndPlan(dspy.Signature):
68
  β†’ WHERE status = 'closed' on sales_table_v2_sales_order
69
  For product catalog or inventory questions: no status filter needed.
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ══════════════════════════════════════════════════════════════
72
  RULE 3 β€” DATE FILTERING
73
  ══════════════════════════════════════════════════════════════
 
 
 
 
74
  The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
75
- Use text comparisons for date filters:
76
- β†’ "last year" (2024): order_date >= '2024-01-01' AND order_date <= '2024-12-31'
77
- β†’ "this year" (2025): order_date >= '2025-01-01' AND order_date <= '2025-12-31'
78
- β†’ "last month": use appropriate YYYY-MM-DD range.
 
79
 
80
  Steps:
81
- 1. Identify: is this ORDER-LEVEL or LINE-ITEM-LEVEL or PO question?
82
- 2. Pick the correct source column per RULE 1 above.
83
- 3. Identify the MINIMUM tables needed (often just one table).
84
- 4. Apply status and date filters as needed.
85
- 5. Produce the simplest correct query plan."""
 
86
 
87
  question = dspy.InputField(desc="The user's natural-language question")
88
  schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
@@ -109,23 +141,34 @@ class SQLGeneration(dspy.Signature):
109
 
110
  CRITICAL RULES:
111
 
112
- 1. USE PRE-COMPUTED TOTALS β€” NEVER RECONSTRUCT THEM:
 
 
 
 
 
 
 
 
 
 
113
  - For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
114
  - For PO totals: use purchase_orders_v6_purchase_order.total_amount
115
  - NEVER add gold_amount + diamond_amount or any component columns β€”
116
  that always gives the WRONG answer (misses labour, taxes, etc.)
117
 
118
- 2. CORRECT FORMULAS:
119
  - Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
120
  - AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
121
  - Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
122
  JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
123
 
124
- 3. DATE FILTERING (order_date is TEXT 'YYYY-MM-DD'):
 
125
  - Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
126
- - Do NOT use EXTRACT() or CAST() on order_date
127
 
128
- 4. SIMPLICITY:
129
  - Single-record lookup = simple WHERE filter, no aggregation
130
  - Only JOIN when needed, only aggregate when needed
131
 
 
68
  β†’ WHERE status = 'closed' on sales_table_v2_sales_order
69
  For product catalog or inventory questions: no status filter needed.
70
 
71
+ ══════════════════════════════════════════════════════════════
72
+ RULE 2.5 β€” SALES ORDER vs PURCHASE ORDER DISAMBIGUATION
73
+ ══════════════════════════════════════════════════════════════
74
+ There are TWO completely separate order systems. NEVER confuse them.
75
+
76
+ SALES ORDERS (outgoing β€” what customers buy from us):
77
+ β†’ Primary table: sales_table_v2_sales_order
78
+ β†’ IDs start with "SO" (e.g. SO13579)
79
+ β†’ Keywords: "sales order", "order", "customer order", "AOV", "revenue",
80
+ "highest order", "best order", "what customers spent", "order value"
81
+ β†’ Example questions β†’ sales_order table:
82
+ "highest order" / "biggest sale" / "top sales order" / "total revenue" / "AOV"
83
+
84
+ PURCHASE ORDERS (incoming β€” what we buy from vendors/suppliers):
85
+ β†’ Primary table: purchase_orders_v6_purchase_order
86
+ β†’ IDs start with "PO" (e.g. PO08796)
87
+ β†’ Keywords: "purchase order", "PO", "vendor order", "supplier order",
88
+ "highest purchase order", "best PO", "what we ordered from vendors"
89
+ β†’ Example questions β†’ purchase_order table:
90
+ "highest purchase order" / "total PO value" / "amount of PO12345"
91
+
92
+ DISAMBIGUATION RULE:
93
+ - If question mentions "purchase order", "PO", "vendor" β†’ use purchase_orders_v6 tables.
94
+ - If question mentions "sales order", "order", "revenue", "customer" β†’ use sales_table_v2 tables.
95
+ - If ambiguous and no "purchase" keyword β†’ default to sales_table_v2_sales_order.
96
+
97
  ══════════════════════════════════════════════════════════════
98
  RULE 3 β€” DATE FILTERING
99
  ══════════════════════════════════════════════════════════════
100
+ The question includes a [CONTEXT] block at the top with today's date,
101
+ current year, and exact date ranges for "last year" and "this year".
102
+ ALWAYS read and use those exact date ranges from the [CONTEXT] block.
103
+
104
  The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
105
+ Use text comparisons ONLY β€” never EXTRACT() or CAST():
106
+ β†’ Use the ranges exactly as given in the [CONTEXT] block.
107
+ β†’ "last year": order_date >= '<last_year>-01-01' AND order_date <= '<last_year>-12-31'
108
+ β†’ "this year": order_date >= '<current_year>-01-01' AND order_date <= '<current_year>-12-31'
109
+ β†’ "last month": use appropriate YYYY-MM-DD range relative to today's date.
110
 
111
  Steps:
112
+ 1. READ the [CONTEXT] block to get current year and last year values.
113
+ 2. Identify: is this SALES ORDER, PURCHASE ORDER, or LINE-ITEM question? (see Rule 2.5)
114
+ 3. Pick the correct table and source column per RULE 1 and RULE 2.5.
115
+ 4. Identify the MINIMUM tables needed (often just one table).
116
+ 5. Apply status and date filters as needed using the exact dates from [CONTEXT].
117
+ 6. Produce the simplest correct query plan."""
118
 
119
  question = dspy.InputField(desc="The user's natural-language question")
120
  schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
 
141
 
142
  CRITICAL RULES:
143
 
144
+ 0. READ THE [CONTEXT] BLOCK IN THE QUESTION:
145
+ - It tells you today's date, current year, and exact date ranges for "last year"/"this year".
146
+ - Always use those exact year values. NEVER guess the year.
147
+
148
+ 1. SALES ORDER vs PURCHASE ORDER β€” NEVER CONFUSE THEM:
149
+ - "purchase order", "PO", "vendor" β†’ purchase_orders_v6_purchase_order table
150
+ - "sales order", "order", "revenue", "AOV", "highest order" (without "purchase") β†’ sales_table_v2_sales_order table
151
+ - Highest/biggest/top "purchase order" β†’ purchase_orders_v6_purchase_order ORDER BY total_amount DESC
152
+ - Highest/biggest/top "order" or "sale" οΏ½οΏ½οΏ½ sales_table_v2_sales_order ORDER BY total_amount DESC
153
+
154
+ 2. USE PRE-COMPUTED TOTALS β€” NEVER RECONSTRUCT THEM:
155
  - For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
156
  - For PO totals: use purchase_orders_v6_purchase_order.total_amount
157
  - NEVER add gold_amount + diamond_amount or any component columns β€”
158
  that always gives the WRONG answer (misses labour, taxes, etc.)
159
 
160
+ 3. CORRECT FORMULAS:
161
  - Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
162
  - AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
163
  - Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
164
  JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
165
 
166
+ 4. DATE FILTERING (order_date is TEXT 'YYYY-MM-DD'):
167
+ - Use the EXACT year values from the [CONTEXT] block in the question.
168
  - Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
169
+ - Do NOT use EXTRACT() or CAST() on order_date.
170
 
171
+ 5. SIMPLICITY:
172
  - Single-record lookup = simple WHERE filter, no aggregation
173
  - Only JOIN when needed, only aggregate when needed
174