jashdoshi77 committed on
Commit
a62955e
·
1 Parent(s): 7c2121a

added column smartness

Browse files
Files changed (1) hide show
  1. ai/signatures.py +96 -42
ai/signatures.py CHANGED
@@ -10,33 +10,79 @@ Consolidated from 8 signatures down to 4 to minimize LLM round-trips:
10
  import dspy
11
 
12
 
13
- # ── 1. Analyze & Plan (combines 3 former stages) ───────────────────────────
14
 
15
  class AnalyzeAndPlan(dspy.Signature):
16
  """You are an expert SQL analyst with strong business intelligence skills.
17
  Given a user question, a database schema, and a DATA PROFILE showing actual
18
  values in the database, analyze the question and produce a detailed query plan.
19
 
20
- CRITICAL BUSINESS RULES — you MUST follow these:
21
- 1. When calculating revenue, sales, or monetary metrics, ONLY include
22
- records with a completed/closed/successful status. Filter out cancelled,
23
- pending, open, returned, or failed records.
24
- 2. Look at the data profile to see which status/categorical values exist
25
- and decide which ones represent VALID/COMPLETED transactions.
26
- 3. For AOV (Average Order Value), divide total revenue of CLOSED orders
27
- by the COUNT of CLOSED orders only.
28
- 4. When a column like 'status' exists, ALWAYS consider whether filtering
29
- by status is needed for accurate business metrics.
30
- 5. For inventory/stock metrics, consider item states appropriately.
31
- 6. When computing counts, totals, or averages, think about which records
32
- should logically be included vs excluded.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  Steps:
35
- 1. Understand the user's question (intent, metrics, entities, filters)
36
- 2. Review the DATA PROFILE to understand actual values in the database
37
- 3. Identify which tables and columns are relevant
38
- 4. Determine appropriate filters (especially status-based) for accurate results
39
- 5. Produce a complete logical query plan"""
40
 
41
  question = dspy.InputField(desc="The user's natural-language question")
42
  schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
@@ -44,38 +90,44 @@ class AnalyzeAndPlan(dspy.Signature):
44
  data_profile = dspy.InputField(desc="Data profile showing actual values: distinct categorical values, numeric ranges, date ranges")
45
 
46
  intent = dspy.OutputField(desc="What the user wants to know (1 sentence)")
47
- relevant_tables = dspy.OutputField(desc="Comma-separated list of tables needed")
48
  relevant_columns = dspy.OutputField(desc="Comma-separated list of table.column pairs needed")
49
  join_conditions = dspy.OutputField(desc="JOIN conditions to use, or 'none'")
50
- where_conditions = dspy.OutputField(desc="WHERE conditions including status/state filters for accurate business metrics, or 'none'")
51
  aggregations = dspy.OutputField(desc="Aggregation functions to apply, or 'none'")
52
  group_by = dspy.OutputField(desc="GROUP BY columns, or 'none'")
53
  order_by = dspy.OutputField(desc="ORDER BY clause, or 'none'")
54
  limit_val = dspy.OutputField(desc="LIMIT value, or 'none'")
55
 
56
 
57
- # ── 2. SQL Generation ──────────────────────────────────────────────────────
58
 
59
  class SQLGeneration(dspy.Signature):
60
  """Generate a valid PostgreSQL SELECT query based on the query plan.
61
  The query must be syntactically correct and only reference existing
62
  tables and columns from the schema.
63
 
64
- SIMPLICITY RULES (MUST FOLLOW):
65
- - If a pre-computed total/summary column exists (e.g. total_amount, grand_total,
66
- total_price, net_amount), SELECT THAT COLUMN DIRECTLY. NEVER reconstruct it
67
- by adding component columns (e.g. gold_amount + diamond_amount) — that will give
68
- wrong answers because it ignores labour, taxes, and other components.
69
- - For single-record lookups (e.g. "total amount of PO12345"), write:
70
- SELECT total_amount FROM <table> WHERE po_id = 'PO12345'
71
- NOT a multi-table join with SUM of parts.
72
- - Only JOIN tables if the required column does not exist in the primary table.
73
- - Only use aggregation (SUM, COUNT, AVG, etc.) when the question genuinely asks
74
- for an aggregate across multiple rows.
75
-
76
- BUSINESS RULES:
77
- - Include status/state filters from the query plan for accurate metrics.
78
- - Ensure the query respects business logic (e.g., only closed orders for revenue).
 
 
 
 
 
 
79
 
80
  CRITICAL: Output ONLY the raw SQL. No markdown, no explanation, no comments."""
81
 
@@ -85,12 +137,14 @@ class SQLGeneration(dspy.Signature):
85
 
86
  sql_query = dspy.OutputField(
87
  desc="The SIMPLEST valid PostgreSQL SELECT query that correctly answers the question. "
88
- "Use pre-computed total columns when available. Avoid unnecessary joins and aggregations. "
89
- "Output ONLY the raw SQL code — no markdown, no explanation, no code fences."
 
 
90
  )
91
 
92
 
93
- # ── 3. SQL Self-Critique & Repair (combined) ───────────────────────────────
94
 
95
  class SQLCritiqueAndFix(dspy.Signature):
96
  """Evaluate a generated SQL query for correctness against the schema.
@@ -110,7 +164,7 @@ class SQLCritiqueAndFix(dspy.Signature):
110
  )
111
 
112
 
113
- # ── 4. Interpret & Insight (combined) ──────────────────────────────────────
114
 
115
  class InterpretAndInsight(dspy.Signature):
116
  """Interpret SQL query results for a non-technical user and generate insights.
@@ -140,7 +194,7 @@ class InterpretAndInsight(dspy.Signature):
140
  )
141
 
142
 
143
- # ── 5. SQL Repair (for execution errors) ──────────────────────────────────
144
 
145
  class SQLRepair(dspy.Signature):
146
  """Given a SQL query that produced a database error, generate a
 
10
  import dspy
11
 
12
 
13
+ # ── 1. Analyze & Plan ──────────────────────────────────────────────────────────
14
 
15
  class AnalyzeAndPlan(dspy.Signature):
16
  """You are an expert SQL analyst with strong business intelligence skills.
17
  Given a user question, a database schema, and a DATA PROFILE showing actual
18
  values in the database, analyze the question and produce a detailed query plan.
19
 
20
+ ══════════════════════════════════════════════════════════════
21
+ RULE 0 — SIMPLICITY FIRST (HIGHEST PRIORITY)
22
+ ══════════════════════════════════════════════════════════════
23
+ Always use the SIMPLEST possible query that correctly answers the question.
24
+ - If a pre-computed total/summary column already exists in the schema
25
+ (e.g. total_amount, grand_total, total_price), USE IT DIRECTLY.
26
+ NEVER reconstruct it by summing component columns — that is always WRONG
27
+ because it misses labour, taxes, making charges, and other components.
28
+ - For single-record lookups (e.g. "total amount of PO12345"), just filter
29
+ and SELECT that column. No extra joins, no SUM.
30
+ - Only JOIN tables when the required column does not exist in the primary table.
31
+ - Only aggregate (SUM, COUNT, AVG) when the question genuinely asks for an
32
+ aggregate across multiple rows.
33
+
34
+ ══════════════════════════════════════════════════════════════
35
+ RULE 1 — WHICH COLUMN TO USE (CRITICAL — READ CAREFULLY)
36
+ ══════════════════════════════════════════════════════════════
37
+
38
+ ORDER-LEVEL QUESTIONS (revenue, AOV, total sales, order value, total amount):
39
+ → Use: sales_table_v2_sales_order.total_amount
40
+ → This is the PRE-COMPUTED grand total per order (includes all items,
41
+ gold, diamonds, making charges, labour, taxes).
42
+ → Examples: "total revenue", "AOV", "average order value", "total sales",
43
+ "how much did customer X spend", "total amount of order SO123".
44
+ → Formula:
45
+ Revenue = SUM(total_amount) FROM sales_order WHERE status = 'closed'
46
+ AOV = AVG(total_amount) FROM sales_order WHERE status = 'closed'
47
+ OR = SUM(total_amount) / COUNT(DISTINCT so_id) WHERE status = 'closed'
48
+ → NEVER use line_total from sales_order_line_pricing for these — it is a
49
+ per-line amount and will give wrong results.
50
+
51
+ LINE-ITEM / PRODUCT-LEVEL QUESTIONS (per-product revenue, top products by sales):
52
+ → Use: sales_table_v2_sales_order_line_pricing.line_total
53
+ → Use ONLY when the question is about individual product/SKU performance.
54
+ → Examples: "revenue per product", "top selling products by revenue",
55
+ "which product generates most sales".
56
+ → JOIN path: sales_order → sales_order_line → sales_order_line_pricing
57
+ → Still filter by sales_order.status = 'closed'.
58
+
59
+ PURCHASE ORDER TOTALS:
60
+ → Use: purchase_orders_v6_purchase_order.total_amount
61
+ → For: "total amount of PO123", "PO value", "purchase order cost".
62
+ → NEVER sum gold_amount + diamond_amount from PO line tables — that misses labour.
63
+
64
+ ══════════════════════════════════════════════════════════════
65
+ RULE 2 — STATUS FILTERING
66
+ ══════════════════════════════════════════════════════════════
67
+ For ALL revenue, sales, AOV, and financial metrics:
68
+ → WHERE status = 'closed' on sales_table_v2_sales_order
69
+ For product catalog or inventory questions: no status filter needed.
70
+
71
+ ══════════════════════════════════════════════════════════════
72
+ RULE 3 β€” DATE FILTERING
73
+ ══════════════════════════════════════════════════════════════
74
+ The order_date column is stored as TEXT in 'YYYY-MM-DD' format.
75
+ Use text comparisons for date filters:
76
+ → "last year" (2024): order_date >= '2024-01-01' AND order_date <= '2024-12-31'
77
+ → "this year" (2025): order_date >= '2025-01-01' AND order_date <= '2025-12-31'
78
+ → "last month": use appropriate YYYY-MM-DD range.
79
 
80
  Steps:
81
+ 1. Identify: is this ORDER-LEVEL or LINE-ITEM-LEVEL or PO question?
82
+ 2. Pick the correct source column per RULE 1 above.
83
+ 3. Identify the MINIMUM tables needed (often just one table).
84
+ 4. Apply status and date filters as needed.
85
+ 5. Produce the simplest correct query plan."""
86
 
87
  question = dspy.InputField(desc="The user's natural-language question")
88
  schema_info = dspy.InputField(desc="Full database schema with table names, columns, and types")
 
90
  data_profile = dspy.InputField(desc="Data profile showing actual values: distinct categorical values, numeric ranges, date ranges")
91
 
92
  intent = dspy.OutputField(desc="What the user wants to know (1 sentence)")
93
+ relevant_tables = dspy.OutputField(desc="Comma-separated list of tables needed (minimum necessary)")
94
  relevant_columns = dspy.OutputField(desc="Comma-separated list of table.column pairs needed")
95
  join_conditions = dspy.OutputField(desc="JOIN conditions to use, or 'none'")
96
+ where_conditions = dspy.OutputField(desc="WHERE conditions including status/date filters, or 'none'")
97
  aggregations = dspy.OutputField(desc="Aggregation functions to apply, or 'none'")
98
  group_by = dspy.OutputField(desc="GROUP BY columns, or 'none'")
99
  order_by = dspy.OutputField(desc="ORDER BY clause, or 'none'")
100
  limit_val = dspy.OutputField(desc="LIMIT value, or 'none'")
101
 
102
 
103
+ # ── 2. SQL Generation ──────────────────────────────────────────────────────────
104
 
105
  class SQLGeneration(dspy.Signature):
106
  """Generate a valid PostgreSQL SELECT query based on the query plan.
107
  The query must be syntactically correct and only reference existing
108
  tables and columns from the schema.
109
 
110
+ CRITICAL RULES:
111
+
112
+ 1. USE PRE-COMPUTED TOTALS — NEVER RECONSTRUCT THEM:
113
+ - For order-level metrics (revenue, AOV): use sales_table_v2_sales_order.total_amount
114
+ - For PO totals: use purchase_orders_v6_purchase_order.total_amount
115
+ - NEVER add gold_amount + diamond_amount or any component columns —
116
+ that always gives the WRONG answer (misses labour, taxes, etc.)
117
+
118
+ 2. CORRECT FORMULAS:
119
+ - Revenue: SELECT SUM(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
120
+ - AOV: SELECT AVG(total_amount) FROM sales_table_v2_sales_order WHERE status = 'closed'
121
+ - Per-product revenue: SUM(line_total) FROM sales_order_line_pricing
122
+ JOIN sales_order_line JOIN sales_order WHERE status = 'closed'
123
+
124
+ 3. DATE FILTERING (order_date is TEXT 'YYYY-MM-DD'):
125
+ - Use: order_date >= 'YYYY-01-01' AND order_date <= 'YYYY-12-31'
126
+ - Do NOT use EXTRACT() or CAST() on order_date
127
+
128
+ 4. SIMPLICITY:
129
+ - Single-record lookup = simple WHERE filter, no aggregation
130
+ - Only JOIN when needed, only aggregate when needed
131
 
132
  CRITICAL: Output ONLY the raw SQL. No markdown, no explanation, no comments."""
133
 
 
137
 
138
  sql_query = dspy.OutputField(
139
  desc="The SIMPLEST valid PostgreSQL SELECT query that correctly answers the question. "
140
+ "Use pre-computed total_amount for order/PO totals. "
141
+ "Use AVG(total_amount) or SUM(total_amount)/COUNT(DISTINCT so_id) for AOV — "
142
+ "NEVER SUM or AVG of line_total for AOV. "
143
+ "Output ONLY raw SQL — no markdown, no explanation, no code fences."
144
  )
145
 
146
 
147
+ # ── 3. SQL Self-Critique & Repair ─────────────────────────────────────────────
148
 
149
  class SQLCritiqueAndFix(dspy.Signature):
150
  """Evaluate a generated SQL query for correctness against the schema.
 
164
  )
165
 
166
 
167
+ # ── 4. Interpret & Insight ────────────────────────────────────────────────────
168
 
169
  class InterpretAndInsight(dspy.Signature):
170
  """Interpret SQL query results for a non-technical user and generate insights.
 
194
  )
195
 
196
 
197
+ # ── 5. SQL Repair ─────────────────────────────────────────────────────────────
198
 
199
  class SQLRepair(dspy.Signature):
200
  """Given a SQL query that produced a database error, generate a