Spaces:
Running
Running
Commit Β·
4aa0ed6
1
Parent(s): a62955e
fix: inject date context and add PO vs SO disambiguation rules
Browse files- ai/pipeline.py +26 -8
- ai/signatures.py +57 -14
ai/pipeline.py
CHANGED
|
@@ -10,6 +10,7 @@ Reduced from 9 stages to 4 LLM calls in the happy path:
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
import re
|
|
|
|
| 13 |
from typing import Any
|
| 14 |
|
| 15 |
import dspy
|
|
@@ -48,16 +49,32 @@ class SQLAnalystPipeline:
|
|
| 48 |
|
| 49 |
# ββ public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def run(self, question: str) -> dict[str, Any]:
|
| 52 |
"""Run the full pipeline and return {sql, data, answer, insights}."""
|
| 53 |
schema_str = format_schema()
|
| 54 |
rels_str = format_relationships()
|
| 55 |
profile_str = get_data_profile()
|
| 56 |
|
|
|
|
|
|
|
| 57 |
# 1. Analyze & Plan (single LLM call replaces 3 former stages)
|
| 58 |
logger.info("Stage 1 β Analyze & Plan")
|
| 59 |
plan = self.analyze(
|
| 60 |
-
question=
|
| 61 |
schema_info=schema_str,
|
| 62 |
relationships=rels_str,
|
| 63 |
data_profile=profile_str,
|
|
@@ -78,7 +95,7 @@ class SQLAnalystPipeline:
|
|
| 78 |
# 2. SQL Generation
|
| 79 |
logger.info("Stage 2 β SQL Generation")
|
| 80 |
sql_result = self.generate_sql(
|
| 81 |
-
question=
|
| 82 |
schema_info=schema_str,
|
| 83 |
query_plan=plan_text,
|
| 84 |
)
|
|
@@ -90,9 +107,8 @@ class SQLAnalystPipeline:
|
|
| 90 |
schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
|
| 91 |
if not schema_valid:
|
| 92 |
logger.warning(f"Schema issues detected: {schema_issues}")
|
| 93 |
-
# Try regenerating SQL once with the issues as feedback
|
| 94 |
sql_result = self.generate_sql(
|
| 95 |
-
question=
|
| 96 |
schema_info=schema_str,
|
| 97 |
query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
|
| 98 |
)
|
|
@@ -120,7 +136,7 @@ class SQLAnalystPipeline:
|
|
| 120 |
sql_query=sql,
|
| 121 |
error_message=exec_result["error"],
|
| 122 |
schema_info=schema_str,
|
| 123 |
-
question=
|
| 124 |
)
|
| 125 |
sql = self._clean_sql(repair_result.corrected_sql)
|
| 126 |
is_safe, reason = validate_sql(sql)
|
|
@@ -166,8 +182,10 @@ class SQLAnalystPipeline:
|
|
| 166 |
rels_str = format_relationships()
|
| 167 |
profile_str = get_data_profile()
|
| 168 |
|
|
|
|
|
|
|
| 169 |
plan = self.analyze(
|
| 170 |
-
question=
|
| 171 |
schema_info=schema_str,
|
| 172 |
relationships=rels_str,
|
| 173 |
data_profile=profile_str,
|
|
@@ -186,7 +204,7 @@ class SQLAnalystPipeline:
|
|
| 186 |
)
|
| 187 |
|
| 188 |
sql_result = self.generate_sql(
|
| 189 |
-
question=
|
| 190 |
schema_info=schema_str,
|
| 191 |
query_plan=plan_text,
|
| 192 |
)
|
|
@@ -197,7 +215,7 @@ class SQLAnalystPipeline:
|
|
| 197 |
schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
|
| 198 |
if not schema_valid:
|
| 199 |
sql_result = self.generate_sql(
|
| 200 |
-
question=
|
| 201 |
schema_info=schema_str,
|
| 202 |
query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
|
| 203 |
)
|
|
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
import re
|
| 13 |
+
from datetime import date
|
| 14 |
from typing import Any
|
| 15 |
|
| 16 |
import dspy
|
|
|
|
| 49 |
|
| 50 |
# ββ public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
|
| 52 |
+
@staticmethod
|
| 53 |
+
def _build_question_with_context(question: str) -> str:
|
| 54 |
+
"""Prepend today's date so the model can resolve relative time references."""
|
| 55 |
+
today = date.today()
|
| 56 |
+
current_year = today.year
|
| 57 |
+
last_year = current_year - 1
|
| 58 |
+
return (
|
| 59 |
+
f"[CONTEXT: Today is {today.isoformat()}. "
|
| 60 |
+
f"Current year = {current_year}. "
|
| 61 |
+
f"'Last year' = {last_year} ({last_year}-01-01 to {last_year}-12-31). "
|
| 62 |
+
f"'This year' = {current_year} ({current_year}-01-01 to {current_year}-12-31).]\n\n"
|
| 63 |
+
f"{question}"
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
def run(self, question: str) -> dict[str, Any]:
|
| 67 |
"""Run the full pipeline and return {sql, data, answer, insights}."""
|
| 68 |
schema_str = format_schema()
|
| 69 |
rels_str = format_relationships()
|
| 70 |
profile_str = get_data_profile()
|
| 71 |
|
| 72 |
+
question_with_date = self._build_question_with_context(question)
|
| 73 |
+
|
| 74 |
# 1. Analyze & Plan (single LLM call replaces 3 former stages)
|
| 75 |
logger.info("Stage 1 β Analyze & Plan")
|
| 76 |
plan = self.analyze(
|
| 77 |
+
question=question_with_date,
|
| 78 |
schema_info=schema_str,
|
| 79 |
relationships=rels_str,
|
| 80 |
data_profile=profile_str,
|
|
|
|
| 95 |
# 2. SQL Generation
|
| 96 |
logger.info("Stage 2 β SQL Generation")
|
| 97 |
sql_result = self.generate_sql(
|
| 98 |
+
question=question_with_date,
|
| 99 |
schema_info=schema_str,
|
| 100 |
query_plan=plan_text,
|
| 101 |
)
|
|
|
|
| 107 |
schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
|
| 108 |
if not schema_valid:
|
| 109 |
logger.warning(f"Schema issues detected: {schema_issues}")
|
|
|
|
| 110 |
sql_result = self.generate_sql(
|
| 111 |
+
question=question_with_date,
|
| 112 |
schema_info=schema_str,
|
| 113 |
query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
|
| 114 |
)
|
|
|
|
| 136 |
sql_query=sql,
|
| 137 |
error_message=exec_result["error"],
|
| 138 |
schema_info=schema_str,
|
| 139 |
+
question=question_with_date,
|
| 140 |
)
|
| 141 |
sql = self._clean_sql(repair_result.corrected_sql)
|
| 142 |
is_safe, reason = validate_sql(sql)
|
|
|
|
| 182 |
rels_str = format_relationships()
|
| 183 |
profile_str = get_data_profile()
|
| 184 |
|
| 185 |
+
question_with_date = self._build_question_with_context(question)
|
| 186 |
+
|
| 187 |
plan = self.analyze(
|
| 188 |
+
question=question_with_date,
|
| 189 |
schema_info=schema_str,
|
| 190 |
relationships=rels_str,
|
| 191 |
data_profile=profile_str,
|
|
|
|
| 204 |
)
|
| 205 |
|
| 206 |
sql_result = self.generate_sql(
|
| 207 |
+
question=question_with_date,
|
| 208 |
schema_info=schema_str,
|
| 209 |
query_plan=plan_text,
|
| 210 |
)
|
|
|
|
| 215 |
schema_valid, schema_issues = check_sql_against_schema(sql, get_schema())
|
| 216 |
if not schema_valid:
|
| 217 |
sql_result = self.generate_sql(
|
| 218 |
+
question=question_with_date,
|
| 219 |
schema_info=schema_str,
|
| 220 |
query_plan=plan_text + f"\n\nPREVIOUS SQL HAD ISSUES: {schema_issues}. Fix them.",
|
| 221 |
)
|
ai/signatures.py
CHANGED
|
@@ -68,21 +68,53 @@ class AnalyzeAndPlan(dspy.Signature):
|
|
| 68 |
β WHERE status = 'closed' on sales_table_v2_sales_order
|
| 69 |
For product catalog or inventory questions: no status filter needed.
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 72 |
RULE 3 β DATE FILTERING
|
| 73 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
|
| 75 |
-
Use text comparisons
|
| 76 |
-
β
|
| 77 |
-
β "
|
| 78 |
-
β "
|
|
|
|
| 79 |
|
| 80 |
Steps:
|
| 81 |
-
1.
|
| 82 |
-
2.
|
| 83 |
-
3.
|
| 84 |
-
4.
|
| 85 |
-
5.
|
|
|
|
| 86 |
|
| 87 |
question = dspy.InputField(desc="The user's natural-language question")
|
| 88 |
schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
|
|
@@ -109,23 +141,34 @@ class SQLGeneration(dspy.Signature):
|
|
| 109 |
|
| 110 |
CRITICAL RULES:
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
- For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
|
| 114 |
- For PO totals: use purchase_orders_v6_purchase_order.total_amount
|
| 115 |
- NEVER add gold_amount + diamond_amount or any component columns β
|
| 116 |
that always gives the WRONG answer (misses labour, taxes, etc.)
|
| 117 |
|
| 118 |
-
|
| 119 |
- Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 120 |
- AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 121 |
- Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
|
| 122 |
JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
|
| 123 |
|
| 124 |
-
|
|
|
|
| 125 |
- Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
|
| 126 |
-
- Do NOT use EXTRACT() or CAST() on order_date
|
| 127 |
|
| 128 |
-
|
| 129 |
- Single-record lookup = simple WHERE filter, no aggregation
|
| 130 |
- Only JOIN when needed, only aggregate when needed
|
| 131 |
|
|
|
|
| 68 |
β WHERE status = 'closed' on sales_table_v2_sales_order
|
| 69 |
For product catalog or inventory questions: no status filter needed.
|
| 70 |
|
| 71 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 72 |
+
RULE 2.5 β SALES ORDER vs PURCHASE ORDER DISAMBIGUATION
|
| 73 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 74 |
+
There are TWO completely separate order systems. NEVER confuse them.
|
| 75 |
+
|
| 76 |
+
SALES ORDERS (outgoing β what customers buy from us):
|
| 77 |
+
β Primary table: sales_table_v2_sales_order
|
| 78 |
+
β IDs start with "SO" (e.g. SO13579)
|
| 79 |
+
β Keywords: "sales order", "order", "customer order", "AOV", "revenue",
|
| 80 |
+
"highest order", "best order", "what customers spent", "order value"
|
| 81 |
+
β Example questions β sales_order table:
|
| 82 |
+
"highest order" / "biggest sale" / "top sales order" / "total revenue" / "AOV"
|
| 83 |
+
|
| 84 |
+
PURCHASE ORDERS (incoming β what we buy from vendors/suppliers):
|
| 85 |
+
β Primary table: purchase_orders_v6_purchase_order
|
| 86 |
+
β IDs start with "PO" (e.g. PO08796)
|
| 87 |
+
β Keywords: "purchase order", "PO", "vendor order", "supplier order",
|
| 88 |
+
"highest purchase order", "best PO", "what we ordered from vendors"
|
| 89 |
+
β Example questions β purchase_order table:
|
| 90 |
+
"highest purchase order" / "total PO value" / "amount of PO12345"
|
| 91 |
+
|
| 92 |
+
DISAMBIGUATION RULE:
|
| 93 |
+
- If question mentions "purchase order", "PO", "vendor" β use purchase_orders_v6 tables.
|
| 94 |
+
- If question mentions "sales order", "order", "revenue", "customer" β use sales_table_v2 tables.
|
| 95 |
+
- If ambiguous and no "purchase" keyword β default to sales_table_v2_sales_order.
|
| 96 |
+
|
| 97 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 98 |
RULE 3 β DATE FILTERING
|
| 99 |
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 100 |
+
The question includes a [CONTEXT] block at the top with today's date,
|
| 101 |
+
current year, and exact date ranges for "last year" and "this year".
|
| 102 |
+
ALWAYS read and use those exact date ranges from the [CONTEXT] block.
|
| 103 |
+
|
| 104 |
The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
|
| 105 |
+
Use text comparisons ONLY β never EXTRACT() or CAST():
|
| 106 |
+
β Use the ranges exactly as given in the [CONTEXT] block.
|
| 107 |
+
β "last year": order_date >= '<last_year>-01-01' AND order_date <= '<last_year>-12-31'
|
| 108 |
+
β "this year": order_date >= '<current_year>-01-01' AND order_date <= '<current_year>-12-31'
|
| 109 |
+
β "last month": use appropriate YYYY-MM-DD range relative to today's date.
|
| 110 |
|
| 111 |
Steps:
|
| 112 |
+
1. READ the [CONTEXT] block to get current year and last year values.
|
| 113 |
+
2. Identify: is this SALES ORDER, PURCHASE ORDER, or LINE-ITEM question? (see Rule 2.5)
|
| 114 |
+
3. Pick the correct table and source column per RULE 1 and RULE 2.5.
|
| 115 |
+
4. Identify the MINIMUM tables needed (often just one table).
|
| 116 |
+
5. Apply status and date filters as needed using the exact dates from [CONTEXT].
|
| 117 |
+
6. Produce the simplest correct query plan."""
|
| 118 |
|
| 119 |
question = dspy.InputField(desc="The user's natural-language question")
|
| 120 |
schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
|
|
|
|
| 141 |
|
| 142 |
CRITICAL RULES:
|
| 143 |
|
| 144 |
+
0. READ THE [CONTEXT] BLOCK IN THE QUESTION:
|
| 145 |
+
- It tells you today's date, current year, and exact date ranges for "last year"/"this year".
|
| 146 |
+
- Always use those exact year values. NEVER guess the year.
|
| 147 |
+
|
| 148 |
+
1. SALES ORDER vs PURCHASE ORDER β NEVER CONFUSE THEM:
|
| 149 |
+
- "purchase order", "PO", "vendor" β purchase_orders_v6_purchase_order table
|
| 150 |
+
- "sales order", "order", "revenue", "AOV", "highest order" (without "purchase") β sales_table_v2_sales_order table
|
| 151 |
+
- Highest/biggest/top "purchase order" β purchase_orders_v6_purchase_order ORDER BY total_amount DESC
|
| 152 |
+
- Highest/biggest/top "order" or "sale" οΏ½οΏ½οΏ½ sales_table_v2_sales_order ORDER BY total_amount DESC
|
| 153 |
+
|
| 154 |
+
2. USE PRE-COMPUTED TOTALS β NEVER RECONSTRUCT THEM:
|
| 155 |
- For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
|
| 156 |
- For PO totals: use purchase_orders_v6_purchase_order.total_amount
|
| 157 |
- NEVER add gold_amount + diamond_amount or any component columns β
|
| 158 |
that always gives the WRONG answer (misses labour, taxes, etc.)
|
| 159 |
|
| 160 |
+
3. CORRECT FORMULAS:
|
| 161 |
- Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 162 |
- AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
|
| 163 |
- Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
|
| 164 |
JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
|
| 165 |
|
| 166 |
+
4. DATE FILTERING (order_date is TEXT 'YYYY-MM-DD'):
|
| 167 |
+
- Use the EXACT year values from the [CONTEXT] block in the question.
|
| 168 |
- Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
|
| 169 |
+
- Do NOT use EXTRACT() or CAST() on order_date.
|
| 170 |
|
| 171 |
+
5. SIMPLICITY:
|
| 172 |
- Single-record lookup = simple WHERE filter, no aggregation
|
| 173 |
- Only JOIN when needed, only aggregate when needed
|
| 174 |
|