Spaces:

Express-Analytics
/

QueryHelper

Runtime error

App Files Files Community

anumaurya114exp committed on Dec 19, 2023

Commit

11a349e

1 Parent(s): 7d0c63c

Added chain-of-thought (CoT) prompting to the "select table, then get query" prompt style

Browse files

Files changed (1) hide show

queryHelperManager.py +83 -14

queryHelperManager.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from gptManager import ChatgptManager
 from utils import *
 import json
 from constants import TABLE_RELATIONS
 class QueryHelper:
@@ -54,7 +55,42 @@ class QueryHelper:
     promptForQuery = self.getSystemPromptForQuery(prospectTablesAndCols)
     self.gptInstanceForQuery.setSystemPrompt(promptForQuery)
     gptResponse = self.gptInstanceForQuery.getResponseForUserInput(userInput)
-    return gptResponse
   def getSystemPromptForTableCols(self):
     schemaName = self.schemaName
@@ -77,18 +113,51 @@ class QueryHelper:
     schemaName = self.schemaName
     platform = self.platform
     tableSummaryDict = json.load(open(self.tableSummaryJson, 'r'))
-    exampleQuery = """SELECT a.customer_id, COUNT(a.product_id) as chandelier_count
-FROM lpdatamart.tbl_f_sales a
-JOIN lpdatamart.tbl_d_product b ON a.product_id = b.product_id
-JOIN lpdatamart.tbl_d_calendar c ON a.date_id = c.date_id
-WHERE UPPER(b.product_name) LIKE '%CHANDELIER%' AND c.calendar_month = 'NOVEMBER' AND c.year = 2023
-GROUP BY a.customer_id
-ORDER BY chandelier_count DESC"""
-    question = "top 5 customers who bought most chandeliers in nov 2023"
-    promptForQuery = f"""You are a powerful text to sql model. Answer user input with sql query. Answer only sql query. And the query needs to run on {platform}. and schemaName is {schemaName}. There is example user input and desired generated sql query. Follow similar patterns as example. eg case insensitive, explicit variable declaration etc. user input : {question}, query : {exampleQuery}. and table's data is \n"""
     for idx, tableName in enumerate(prospectTablesAndCols.keys(), start=1):
-        promptForQuery += f"table name is {tableName}, table data is {self.sampleData[tableName][prospectTablesAndCols[tableName]].head(self.gptSampleRows)}"
-    promptForQuery += f"and table Relations are {TABLE_RELATIONS}"
-    return promptForQuery.replace("\\"," ").replace("  "," ").replace("XXXX", "    ")

 from gptManager import ChatgptManager
 from utils import *
 import json
+import sqlparse
 from constants import TABLE_RELATIONS
 class QueryHelper:
     promptForQuery = self.getSystemPromptForQuery(prospectTablesAndCols)
     self.gptInstanceForQuery.setSystemPrompt(promptForQuery)
     gptResponse = self.gptInstanceForQuery.getResponseForUserInput(userInput)
+    #following CoT in select column then get query to save tokens
+    tryParsing = True
+    parsedSql = False
+    if tryParsing:
+      try:
+        txt = gptResponse.split("```json")[-1].split('```')[0].replace('\n', '')
+        sqlResult = json.loads(txt)['finalResult']
+        parsedSql = True
+        tryParsing = False
+      except:
+        print("Couldn't parse desired result from gpt response using method 1.")
+    if tryParsing:
+      try:
+        sqlResult = json.loads(gptResponse)['finalResult']
+        parsedSql = True
+        tryParsing = False
+      except:
+        print("Couldn't parse desired result from gpt response using method 2")
+    if parsedSql:
+      isFormatted = False
+      try:
+        formattedSql = sqlparse.format(sqlResult, reindent=True)
+        responseToReturn = formattedSql
+        isFormatted = True
+      except:
+        isFormatted = False
+      if not isFormatted:
+        try:
+          formattedSql = sqlparse.format(sqlResult['result'], reindent=True)
+          responseToReturn = formattedSql
+          print("gpt didn't give parsed result. So parsing again. the formatting.")
+        except:
+          responseToReturn = str(sqlResult)
+    else:
+      responseToReturn = gptResponse
+    return responseToReturn
   def getSystemPromptForTableCols(self):
     schemaName = self.schemaName
     schemaName = self.schemaName
     platform = self.platform
     tableSummaryDict = json.load(open(self.tableSummaryJson, 'r'))
+    egUserInput = "I want to get top 5 product categories by state, then rank categories on decreasing order of total sales"
+    cotSubtaskOutput = """{
+          "subquery1": {
+            "inputSubquery": [],
+            "descriptioin":"calculate the total sales and assigns ranks to product categories within each state based on the descending order of sales in the tbl_f_sales table, utilizing joins with tbl_d_product and tbl_d_customer tables.",
+            "result": "SELECT c.state, b.category, SUM(a.transaction_amount) as total_sales,
+    RANK() OVER(PARTITION BY c.state ORDER BY SUM(a.transaction_amount) DESC) as category_rank
+    FROM lpdatamart.tbl_f_sales a
+    JOIN lpdatamart.tbl_d_product b
+    ON a.product_id = b.product_id
+    JOIN lpdatamart.tbl_d_customer c
+    ON a.customer_id = c.customer_id
+    GROUP BY c.state, b.category "
+          },
+          "subquery2": {
+            "inputSubquery": ["subquery1"],
+            "description":"extracts state, category, and total sales information from a subquery named "subquery1," filtering the results to include only categories with ranks up to 5 and sorting them by state and category rank."
+            "result":"SELECT state, category, total_sales
+FROM ranked_categories
+WHERE category_rank <= 5
+ORDER BY state, category_rank"
+          },
+          "finalResult":"WITH subquery1 AS (
+    SELECT c.state, b.category, SUM(a.transaction_amount) as total_sales,
+    RANK() OVER(PARTITION BY c.state ORDER BY SUM(a.transaction_amount) DESC) as category_rank
+    FROM lpdatamart.tbl_f_sales a
+    JOIN lpdatamart.tbl_d_product b
+    ON a.product_id = b.product_id
+    JOIN lpdatamart.tbl_d_customer c
+    ON a.customer_id = c.customer_id
+    GROUP BY c.state, b.category
+)
+SELECT state, category, total_sales
+FROM subquery1
+WHERE category_rank <= 5
+ORDER BY state, category_rank"
+        }"""
+    prompt = f"""You are a powerful text to sql model. Your task is to return sql query which answers
+    user's input. Please follow subquery structure if the sql needs to have multiple subqueries.
+    ###example userInput is {egUserInput}. output is {cotSubtaskOutput}. Output should be in json format as provided. Only output should be in response, nothing else.\n\n
+    """
     for idx, tableName in enumerate(prospectTablesAndCols.keys(), start=1):
+        prompt += f"table name is {tableName}, table data is {self.sampleData[tableName][prospectTablesAndCols[tableName]].head(self.gptSampleRows)}"
+    prompt += f"and table Relations are {TABLE_RELATIONS}"
+    return prompt.replace("\\"," ").replace("  "," ").replace("XXXX", "    ")