Spaces:

Express-Analytics
/

QueryHelper

Runtime error

App Files Community

anumaurya114exp committed on Dec 12, 2023

Commit

0ff15a5

1 Parent(s): 1d61200

new cot and with history query helper

Browse files

Files changed (5) hide show

app.py +13 -22
constants.py +37 -2
gptManager.py +12 -59
queryHelperManager.py +47 -59
queryHelperManagerCoT.py +91 -107

app.py CHANGED Viewed

@@ -35,8 +35,10 @@ selectedTablesAndCols = metadataLayout.getSelectedTablesAndCols()
 openAIClient = OpenAI(api_key=OPENAI_API_KEY)
-gptInstance = ChatgptManager(openAIClient, model=GPT_MODEL)
-queryHelper = QueryHelper(gptInstance=gptInstance,
                           schemaName=SCHEMA_NAME,platform=PLATFORM,
                           metadataLayout=metadataLayout,
                           sampleDataRows=SAMPLE_ROW_MAX,
@@ -46,8 +48,8 @@ queryHelper = QueryHelper(gptInstance=gptInstance,
 openAIClient2 = OpenAI(api_key=OPENAI_API_KEY)
-gptInstance2 = ChatgptManager(openAIClient2, model=GPT_MODEL)
-queryHelperCot = QueryHelperChainOfThought(gptInstance=gptInstance2,
                           schemaName=SCHEMA_NAME,platform=PLATFORM,
                           metadataLayout=metadataLayout,
                           sampleDataRows=SAMPLE_ROW_MAX,
@@ -68,16 +70,15 @@ def respond(message, chatHistory):
   """gpt response handler for gradio ui"""
   global queryHelper
   try:
-      botMessage, prospectTablesAndCols  = queryHelper.getQueryForUserInput(message, chatHistory)
   except Exception as e:
       errorMessage = {"function":"queryHelper.getQueryForUserInput","error":str(e), "userInput":message}
       saveLog(errorMessage, 'error')
       raise ValueError(str(e))
   queryGenerated = extractSqlFromGptResponse(botMessage)
-  logMessage = {"userInput":message, "tablesColsSelectedByGpt":str(prospectTablesAndCols) , "queryGenerated":queryGenerated, "completeGptResponse":botMessage,  "function":"queryHelper.getQueryForUserInput"}
   saveLog(logMessage)
   chatHistory.append((message, botMessage))
-  time.sleep(2)
   return "", chatHistory
 # Function to save history of chat
@@ -85,19 +86,14 @@ def respondCoT(message, chatHistory):
   """gpt response handler for gradio ui"""
   global queryHelperCot
   try:
-      if "modify" in message[:12].lower():
-        botMessage, prospectTablesAndCols  = queryHelperCot.getQueryForUserInput(message, chatHistory)
-      else:
-         botMessage, prospectTablesAndCols  = queryHelperCot.getQueryForUserInputCoT(message)
   except Exception as e:
-      errorMessage = {"function":"queryHelperFineTuned.getQueryForUserInput","error":str(e), "userInput":message}
-      saveLog(errorMessage, 'error')
-      raise ValueError(str(e))
-  queryGenerated = extractSqlFromGptResponse(botMessage)
-  logMessage = {"userInput":message, "tablesColsSelectedByGpt":str(prospectTablesAndCols) , "queryGenerated":queryGenerated, "completeGptResponse":botMessage, "function":"queryHelperCot.getQueryForUserInputCoT"}
   saveLog(logMessage)
   chatHistory.append((message, botMessage))
-  time.sleep(2)
   return "", chatHistory
@@ -131,11 +127,6 @@ def testSQL(sql):
       dbEngine2.disconnect()
       print(f"Error occured during running the query {sql}.\n and the error is {str(e)}")
-      # prompt = f"Please correct the following sql query, also it has to be run on {PLATFORM}. sql query is \n {sql}. the error occured is {str(e)}."
-      # modifiedSql = queryHelper.modifySqlQueryEnteredByUser(prompt)
-      # logMessage = {"function":"queryHelper.modifySqlQueryEnteredByUser", "sqlQuery":sql, "modifiedSQLQuery":modifiedSql}
-      # saveLog(logMessage, 'info')
       return f"The query you entered throws some error. Here is the error.\n {str(e)}"

 openAIClient = OpenAI(api_key=OPENAI_API_KEY)
+gptInstanceForTableCols = ChatgptManager(openAIClient, model=GPT_MODEL)
+gptInstanceForQuery = ChatgptManager(openAIClient, model=GPT_MODEL)
+queryHelper = QueryHelper(gptInstanceForTableCols=gptInstanceForTableCols,
+                          gptInstanceForQuery=gptInstanceForQuery,
                           schemaName=SCHEMA_NAME,platform=PLATFORM,
                           metadataLayout=metadataLayout,
                           sampleDataRows=SAMPLE_ROW_MAX,
 openAIClient2 = OpenAI(api_key=OPENAI_API_KEY)
+gptInstanceForCoT = ChatgptManager(openAIClient2, model=GPT_MODEL)
+queryHelperCot = QueryHelperChainOfThought(gptInstanceForCoT=gptInstanceForCoT,
                           schemaName=SCHEMA_NAME,platform=PLATFORM,
                           metadataLayout=metadataLayout,
                           sampleDataRows=SAMPLE_ROW_MAX,
   """gpt response handler for gradio ui"""
   global queryHelper
   try:
+      botMessage  = queryHelper.getQueryForUserInput(message)
   except Exception as e:
       errorMessage = {"function":"queryHelper.getQueryForUserInput","error":str(e), "userInput":message}
       saveLog(errorMessage, 'error')
       raise ValueError(str(e))
   queryGenerated = extractSqlFromGptResponse(botMessage)
+  logMessage = {"userInput":message, "queryGenerated":queryGenerated, "completeGptResponse":botMessage,  "function":"queryHelper.getQueryForUserInput"}
   saveLog(logMessage)
   chatHistory.append((message, botMessage))
   return "", chatHistory
 # Function to save history of chat
   """gpt response handler for gradio ui"""
   global queryHelperCot
   try:
+    botMessage  = queryHelperCot.getQueryForUserInputCoT(message)
   except Exception as e:
+    errorMessage = {"function":"queryHelperCot.getQueryForUserInput","error":str(e), "userInput":message}
+    saveLog(errorMessage, 'error')
+    raise ValueError(str(e))
+  logMessage = {"userInput":message, "completeGptResponse":botMessage, "function":"queryHelperCot.getQueryForUserInputCoT"}
   saveLog(logMessage)
   chatHistory.append((message, botMessage))
   return "", chatHistory
       dbEngine2.disconnect()
       print(f"Error occured during running the query {sql}.\n and the error is {str(e)}")
       return f"The query you entered throws some error. Here is the error.\n {str(e)}"

constants.py CHANGED Viewed

@@ -2,7 +2,7 @@ __all__ = ["SCHEMA_NAME", "GPT_SAMPLE_ROWS", "PLATFORM", "SAMPLE_ROW_MAX", "DEFA
 #Constants
 SCHEMA_NAME = "lpdatamart"
-GPT_SAMPLE_ROWS = 5
 PLATFORM = "Amazon Redshift"
 SAMPLE_ROW_MAX = 50
 QUERY_TIMEOUT = 20 #timeout in seconds
@@ -30,4 +30,39 @@ event_col = ['event_id', 'event_type', 'event_description', 'event_detail', 'sta
 DEFAULT_TABLES_COLS = {"tbl_d_customer":customer_col, "tbl_d_product":product_col, "tbl_f_sales":sales_col,
           "tbl_d_store":store_col, "tbl_d_channel":channel_col, "tbl_d_lineaction_code":lineaction_col,
                   "tbl_d_calendar":calendar_col, 'tbl_f_browse':browse_col, 'tbl_d_time': time_col, 'tbl_d_browse_action': browse_action_col,
-                  'tbl_d_browse_category':browse_category_col, 'tbl_d_style':style_col, 'tbl_f_emailing': email_col, 'tbl_d_event':event_col}

 #Constants
 SCHEMA_NAME = "lpdatamart"
+GPT_SAMPLE_ROWS = 1
 PLATFORM = "Amazon Redshift"
 SAMPLE_ROW_MAX = 50
 QUERY_TIMEOUT = 20 #timeout in seconds
 DEFAULT_TABLES_COLS = {"tbl_d_customer":customer_col, "tbl_d_product":product_col, "tbl_f_sales":sales_col,
           "tbl_d_store":store_col, "tbl_d_channel":channel_col, "tbl_d_lineaction_code":lineaction_col,
                   "tbl_d_calendar":calendar_col, 'tbl_f_browse':browse_col, 'tbl_d_time': time_col, 'tbl_d_browse_action': browse_action_col,
+                  'tbl_d_browse_category':browse_category_col, 'tbl_d_style':style_col, 'tbl_f_emailing': email_col, 'tbl_d_event':event_col}
# Join relationships between dimension and fact tables, one per line, in the
# form "<table> and <table> on <join column(s)>". The text is embedded verbatim
# in the system prompts sent to the model.
# The calendar dimension is spelled "tbl_d_calendar" to match the key used in
# DEFAULT_TABLES_COLS; it was previously misspelled "tbl_d_calender" here.
# NOTE(review): "tbl_d_promption" (joined on promotion_id) and "tbl_f_email"
# (vs. "tbl_f_emailing") also look like possible typos - confirm against the
# actual schema before changing them.
TABLE_RELATIONS = """tbl_d_store and tbl_f_sales on store_id
tbl_d_time and tbl_f_sales on time_id
tbl_d_product and tbl_f_sales on product_id
tbl_d_channel and tbl_f_sales on channel_id
tbl_d_customer and tbl_f_sales on customer_id
tbl_d_source and tbl_f_sales on source_id
tbl_d_calendar and tbl_f_sales on date_id
tbl_d_associate and tbl_f_sales on associate_id
tbl_d_promption and tbl_f_sales on promotion_id
tbl_d_keycode and tbl_f_sales on keycode_id
tbl_d_lineaction_code and tbl_f_sales on tbl_d_lineaction_code.line_action_code, tbl_f_sales.line_action
tbl_d_event and tbl_f_emailing on event_id
tbl_d_calendar and tbl_f_emailing on date_id
tbl_d_e_sourceid and tbl_f_emailing on email_source_key
tbl_d_time and tbl_f_emailing on time_id
tbl_d_customer and tbl_f_emailing on customer_id
tbl_d_email and tbl_f_email on email_key
tbl_d_email and tbl_d_url on url_id
tbl_f_mailing and tbl_d_calendar on date_id
tbl_d_customer and tbl_f_mailing on customer_id
tbl_d_keycode and tbl_f_mailing on keycode_id
tbl_d_email and tbl_f_browse on email_key
tbl_d_calendar and tbl_f_browse on date_id
tbl_d_product and tbl_f_browse on product_id
tbl_d_browse_action and tbl_f_browse on browse_action_id
tbl_d_browse_style and tbl_f_browse on browse_style_id
tbl_d_source and tbl_f_activity on source_id
tbl_d_calendar and tbl_f_activity on date_id
tbl_d_time and tbl_f_activity on time_id
tbl_d_customer and tbl_f_activity on customer_id
tbl_d_customer and tbl_f_opt_out on customer_id
tbl_d_calendar and tbl_f_opt_out on date_id
tbl_d_time and tbl_f_opt_out on time_id"""

gptManager.py CHANGED Viewed

@@ -7,40 +7,19 @@ class ChatgptManager:
     self.tokenLimit = tokenLimit
     self.model = model
     self.throwError = throwError
-  def _chatHistoryToGptMessages(self, chatHistory=[]):
-    messages = []
-    for i in range(len(chatHistory)):
-      if i%2==0:
-        message = {"role":"user", "content":chatHistory[i]}
-      else:
-        message = {"role":"assistant", "content": chatHistory[i]}
-      messages.append(message)
-    return messages
-  def getResponseForUserInput(self, userInput, systemPrompt, chatHistory=[]):
-    self.messages = self._chatHistoryToGptMessages(chatHistory[:])
-    newMessage = {"role":"system", "content":systemPrompt}
-    if not self.isTokeLimitExceeding(newMessage):
-      self.messages.append(newMessage)
     else:
-      if chatHistory==[]:
-        raise ValueError("System Prompt Too long.")
-      return self.getResponseForUserInput(userInput=userInput, systemPrompt=systemPrompt)
     userMessage = {"role":"user", "content":userInput}
-    if not self.isTokeLimitExceeding(userMessage):
-      self.messages.append(userMessage)
-    else:
-      if chatHistory==[]:
-        raise ValueError("Token Limit exceeding. With user input")
-      return self.getResponseForUserInput(userInput=userInput, systemPrompt=systemPrompt)
-    # completion = self.client.chat.completions.create(
-    #   model="gpt-3.5-turbo-1106",
-    #   messages=self.messages,
-    #   temperature=0,
-    # )
     print(self.messages, "messages being sent to gpt for completion.")
     try:
       completion = self.client.chat.completions.create(
@@ -51,34 +30,8 @@ class ChatgptManager:
       gptResponse = completion.choices[0].message.content
     except Exception as e:
       if not self.throwError:
-        gptResponse = "Error while connecting with gpt " + str(e)[:50] + "..."
     self.messages.append({"role": "assistant", "content": gptResponse})
-    return gptResponse
-  def isTokeLimitExceeding(self, newMessage=None, truncate=True, throwError=True):
-    if self.getTokenCount(newMessage=newMessage) > self.tokenLimit:
-      return True
-    return False
-  def getTokenCount(self, newMessage=None):
-    """Token count including new Message"""
-    def getWordsCount(text):
-      return len(re.findall(r'\b\w+\b', text))
-    messages = self.messages[:]
-    if newMessage!=None:
-      messages.append(newMessage)
-    if len(messages)!=0:
-      combinedContent = " ".join([str(msg["content"]) for msg in messages])
-    else:
-      combinedContent = ""
-    currentTokensInMessages = getWordsCount(combinedContent)
-    return currentTokensInMessages

     self.tokenLimit = tokenLimit
     self.model = model
     self.throwError = throwError
+    self.messages = []
+  def setSystemPrompt(self, systemPrompt):
+    systemMessage = {"role":"system", "content":systemPrompt}
+    if len(self.messages)==0:
+      self.messages = [systemMessage]
     else:
+      del self.messages[0]
+      self.messages.insert(0, systemMessage)
+  def getResponseForUserInput(self, userInput):
     userMessage = {"role":"user", "content":userInput}
+    self.messages.append(userMessage)
     print(self.messages, "messages being sent to gpt for completion.")
     try:
       completion = self.client.chat.completions.create(
       gptResponse = completion.choices[0].message.content
     except Exception as e:
       if not self.throwError:
+        errorText = "Error while connecting with gpt " + str(e)[:100] + "..."
+        return errorText
     self.messages.append({"role": "assistant", "content": gptResponse})
+    return gptResponse

queryHelperManager.py CHANGED Viewed

@@ -1,11 +1,16 @@
 from gptManager import ChatgptManager
 from utils import *
 class QueryHelper:
-  def __init__(self, gptInstance: ChatgptManager, dbEngine, schemaName,
                platform, metadataLayout: MetaDataLayout, sampleDataRows,
-               gptSampleRows, getSampleDataForTablesAndCols):
-    self.gptInstance = gptInstance
     self.schemaName = schemaName
     self.platform = platform
     self.metadataLayout = metadataLayout
@@ -13,6 +18,7 @@ class QueryHelper:
     self.gptSampleRows = gptSampleRows
     self.getSampleDataForTablesAndCols = getSampleDataForTablesAndCols
     self.dbEngine = dbEngine
     self._onMetadataChange()
   def _onMetadataChange(self):
@@ -20,10 +26,12 @@ class QueryHelper:
     sampleDataRows = self.sampleDataRows
     dbEngine = self.dbEngine
     schemaName = self.schemaName
     selectedTablesAndCols = metadataLayout.getSelectedTablesAndCols()
     self.sampleData = self.getSampleDataForTablesAndCols(dbEngine=dbEngine,schemaName=schemaName,
                                                          tablesAndCols=selectedTablesAndCols, maxRows=sampleDataRows)
   def getMetadata(self) -> MetaDataLayout :
     return self.metadataLayout
@@ -31,52 +39,43 @@ class QueryHelper:
   def updateMetadata(self, metadataLayout):
     self.metadataLayout = metadataLayout
     self._onMetadataChange()
-  def modifySqlQueryEnteredByUser(self, userSqlQuery):
-    platform = self.platform
-    userPrompt = f"Please correct the following sql query, also it has to be run on {platform}. sql query is \n {userSqlQuery}."
-    systemPrompt = ""
-    modifiedSql = self.gptInstance.getResponseForUserInput(userPrompt, systemPrompt)
-    return modifiedSql
-  def filteredSampleDataForProspects(self, prospectTablesAndCols):
-    sampleData = self.sampleData
-    filteredData = {}
-    for table in prospectTablesAndCols.keys():
-      # filteredData[table] = sampleData[table][prospectTablesAndCols[table]]
-      #take all columns of prospects
-      filteredData[table] = sampleData[table][prospectTablesAndCols[table]]
-    return filteredData
-  def getQueryForUserInput(self, userInput, chatHistory=[]):
-    gptSampleRows = self.gptSampleRows
     selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
-    prospectTablesAndCols = self.getProspectiveTablesAndCols(userInput, selectedTablesAndCols, chatHistory)
-    print("getting prospects", prospectTablesAndCols)
-    prospectTablesData = self.filteredSampleDataForProspects(prospectTablesAndCols)
-    systemPromptForQueryGeneration = self.getSystemPromptForQueryGeneration(prospectTablesData, gptSampleRows=gptSampleRows)
-    queryByGpt = self.gptInstance.getResponseForUserInput(userInput, systemPromptForQueryGeneration, chatHistory)
-    queryByGpt = preProcessGptQueryReponse(queryByGpt, metadataLayout=self.metadataLayout)
-    return queryByGpt, prospectTablesAndCols
-  def getProspectiveTablesAndCols(self, userInput, selectedTablesAndCols, chatHistory=[]):
     schemaName = self.schemaName
-    systemPromptForProspectColumns = self.getSystemPromptForProspectColumns(selectedTablesAndCols)
-    prospectiveTablesColsText = self.gptInstance.getResponseForUserInput(userInput, systemPromptForProspectColumns, chatHistory)
-    prospectTablesAndCols = {}
-    for table in selectedTablesAndCols.keys():
-      if table in prospectiveTablesColsText:
-        prospectTablesAndCols[table] = []
-        for column in selectedTablesAndCols[table]:
-          if column in prospectiveTablesColsText:
-            prospectTablesAndCols[table].append(column)
-    return prospectTablesAndCols
-  def getSystemPromptForQueryGeneration(self, prospectTablesData, gptSampleRows):
     schemaName = self.schemaName
     platform = self.platform
     exampleQuery = """SELECT a.customer_id, COUNT(a.product_id) as chandelier_count
 FROM lpdatamart.tbl_f_sales a
 JOIN lpdatamart.tbl_d_product b ON a.product_id = b.product_id
@@ -86,20 +85,9 @@ GROUP BY a.customer_id
 ORDER BY chandelier_count DESC"""
     question = "top 5 customers who bought most chandeliers in nov 2023"
-    prompt = f"""Given an input text, generate the corresponding SQL query for given details. Schema Name is {schemaName}. And sql platform is {platform}.\n following is sample data. Also
-    There is example user input and desired generated sql query. Follow similar patterns as example. eg case insensitive, explicit variable declaration etc. user input : {question}, query : {exampleQuery} """
-    for idx, tableName in enumerate(prospectTablesData.keys(), start=1):
-        prompt += f"table name is {tableName}, table data is {prospectTablesData[tableName].head(gptSampleRows)}"
-    prompt += "XXXX"
-    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")
-  def getSystemPromptForProspectColumns(self, selectedTablesAndCols):
-    schemaName = self.schemaName
-    platform = self.platform
-    prompt = f"""Given an input text, User wants to know which all tables and columns would be possibily to have the desired data. Output them as json. Schema Name is {schemaName}. And sql platform is {platform}.\n"""
-    for idx, tableName in enumerate(selectedTablesAndCols.keys(), start=1):
-        prompt += f"table name {tableName} {', '.join(selectedTablesAndCols[tableName])}"
-    prompt += "XXXX"
-    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")

 from gptManager import ChatgptManager
 from utils import *
+import json
+from constants import TABLE_RELATIONS
 class QueryHelper:
+  def __init__(self, gptInstanceForTableCols: ChatgptManager,
+               gptInstanceForQuery: ChatgptManager,
+               dbEngine, schemaName,
                platform, metadataLayout: MetaDataLayout, sampleDataRows,
+               gptSampleRows, getSampleDataForTablesAndCols, tableSummaryJson='tableSummaryDict.json'):
+    self.gptInstanceForTableCols = gptInstanceForTableCols
+    self.gptInstanceForQuery = gptInstanceForQuery
     self.schemaName = schemaName
     self.platform = platform
     self.metadataLayout = metadataLayout
     self.gptSampleRows = gptSampleRows
     self.getSampleDataForTablesAndCols = getSampleDataForTablesAndCols
     self.dbEngine = dbEngine
+    self.tableSummaryJson = tableSummaryJson
     self._onMetadataChange()
   def _onMetadataChange(self):
     sampleDataRows = self.sampleDataRows
     dbEngine = self.dbEngine
     schemaName = self.schemaName
     selectedTablesAndCols = metadataLayout.getSelectedTablesAndCols()
     self.sampleData = self.getSampleDataForTablesAndCols(dbEngine=dbEngine,schemaName=schemaName,
                                                          tablesAndCols=selectedTablesAndCols, maxRows=sampleDataRows)
+    self.promptTableColsInfo = self.getSystemPromptForTableCols()
+    self.gptInstanceForTableCols.setSystemPrompt(self.promptTableColsInfo)
   def getMetadata(self) -> MetaDataLayout :
     """Return the metadata layout currently held by this helper."""
     return self.metadataLayout
   def updateMetadata(self, metadataLayout):
     """Store `metadataLayout` as the active layout, then call `_onMetadataChange()`."""
     self.metadataLayout = metadataLayout
     self._onMetadataChange()
+  def getQueryForUserInput(self, userInput):
+    prospectTablesAndColsText = self.gptInstanceForTableCols.getResponseForUserInput(userInput)
     selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    prospectTablesAndCols = dict()
+    for table in selectedTablesAndCols:
+      if table in prospectTablesAndColsText:
+        prospectTablesAndCols[table] = []
+        for col in selectedTablesAndCols[table]:
+          if col in prospectTablesAndColsText:
+            prospectTablesAndCols[table].append(col)
+    promptForQuery = getSystemPromptForQuery(prospectTablesAndCols)
+    self.gptInstanceForQuery.setSystemPrompt(promptForQuery)
+    gptResponse = self.gptInstanceForQuery.getResponseForUserInput(userInput)
+    return gptResponse
+  def getSystemPromptForTableCols(self):
     schemaName = self.schemaName
+    platform = self.platform
+    tableSummaryDict = json.load(self.tableSummaryJson)
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    promptTableInfo = f"""You are a powerful text to sql model. Answer which tables and columns are needed
+    to answer user input using sql query. and following are tables and columns info. and example user input and result query."""
+    for idx, tableName in enumerate(selectedTablesAndCols.keys(), start=1):
+        promptTableInfo += f"table name {tableName} and summary is {tableSummaryDict[tableName]}"
+        promptTableInfo += f" and columns {', '.join(selectedTablesAndCols[tableName])} \n"
+    promptTableInfo += "XXXX"
+    #Join statements
+    promptTableInfo += f"and table Relations are {TABLE_RELATIONS}"
+    return promptTableInfo
+  def getSystemPromptForQuery(self, prospectTablesAndCols):
     schemaName = self.schemaName
     platform = self.platform
+    tableSummaryDict = json.load(self.tableSummaryJson)
     exampleQuery = """SELECT a.customer_id, COUNT(a.product_id) as chandelier_count
 FROM lpdatamart.tbl_f_sales a
 JOIN lpdatamart.tbl_d_product b ON a.product_id = b.product_id
 ORDER BY chandelier_count DESC"""
     question = "top 5 customers who bought most chandeliers in nov 2023"
+    promptForQuery = f"""You are a powerful text to sql model. Answer user input with sql query. And the query needs to run on {platform}. and schemaName is {schemaName}. There is example user input and desired generated sql query. Follow similar patterns as example. eg case insensitive, explicit variable declaration etc. user input : {question}, query : {exampleQuery}. and table's data is \n"""
+    for idx, tableName in enumerate(prospectTablesAndCols.keys(), start=1):
+        promptForQuery += f"table name is {tableName}, table data is {self.sampleData[tableName][prospectTablesAndCols[tableName]].head(gptSampleRows)}"
+    promptForQuery += f"and table Relations are {TABLE_RELATIONS}"
+    return promptForQuery.replace("\\"," ").replace("  "," ").replace("XXXX", "    ")

queryHelperManagerCoT.py CHANGED Viewed

@@ -1,13 +1,14 @@
 from gptManager import ChatgptManager
 from utils import *
-import re
-import json
-class QueryHelperChainOfThought:
-  def __init__(self, gptInstance: ChatgptManager, dbEngine, schemaName,
                platform, metadataLayout: MetaDataLayout, sampleDataRows,
-               gptSampleRows, getSampleDataForTablesAndCols):
-    self.gptInstance = gptInstance
     self.schemaName = schemaName
     self.platform = platform
     self.metadataLayout = metadataLayout
@@ -15,6 +16,7 @@ class QueryHelperChainOfThought:
     self.gptSampleRows = gptSampleRows
     self.getSampleDataForTablesAndCols = getSampleDataForTablesAndCols
     self.dbEngine = dbEngine
     self._onMetadataChange()
   def _onMetadataChange(self):
@@ -22,10 +24,10 @@ class QueryHelperChainOfThought:
     sampleDataRows = self.sampleDataRows
     dbEngine = self.dbEngine
     schemaName = self.schemaName
     selectedTablesAndCols = metadataLayout.getSelectedTablesAndCols()
     self.sampleData = self.getSampleDataForTablesAndCols(dbEngine=dbEngine,schemaName=schemaName,
                                                          tablesAndCols=selectedTablesAndCols, maxRows=sampleDataRows)
   def getMetadata(self) -> MetaDataLayout :
     return self.metadataLayout
@@ -33,91 +35,93 @@ class QueryHelperChainOfThought:
   def updateMetadata(self, metadataLayout):
     self.metadataLayout = metadataLayout
     self._onMetadataChange()
-  def modifySqlQueryEnteredByUser(self, userSqlQuery):
-    platform = self.platform
-    userPrompt = f"Please correct the following sql query, also it has to be run on {platform}. sql query is \n {userSqlQuery}."
-    systemPrompt = ""
-    modifiedSql = self.gptInstance.getResponseForUserInput(userPrompt, systemPrompt)
-    return modifiedSql
-  def filteredSampleDataForProspects(self, prospectTablesAndCols):
-    sampleData = self.sampleData
-    filteredData = {}
-    for table in prospectTablesAndCols.keys():
-      # filteredData[table] = sampleData[table][prospectTablesAndCols[table]]
-      #take all columns of prospects
-      filteredData[table] = sampleData[table][prospectTablesAndCols[table]]
-    return filteredData
-  def extractSingleJson(self, text):
-    pattern = r'\{.*?\}'
-    matches = re.findall(pattern, text, re.DOTALL)
-    extracted_json = [json.loads(match) for match in matches][0]
-    return extracted_json
   def getQueryForUserInputCoT(self, userInput):
-    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
-    systemPromptTail = self.getSystemPromptTailForCoTStep1(selectedTablesAndCols)
-    #1. Is the input complete to create a query, or ask user to reask with more detailed input
-    systemPromptForInputClarification = """Given an input text, user want to generate sql query. Please answer if the user input is complete or user needs to ask in more detailed way. Answer in following format. 'Yes' ; if yes, break the userinput into smaller subtask for query generation. Formatted into
-        {
-        "Task 1": "task 1 description",
-        "Task 2": "task 2 description"
-        }
-'No' ; if no, then Reason- please be more detailed about customer details; if more modification needed."""
-    systemPromptForInputClarification = systemPromptForInputClarification + '\n' + systemPromptTail
-    cotStep1 = self.gptInstance.getResponseForUserInput(userInput, systemPromptForInputClarification)
-    if "yes" in cotStep1.lower()[:5]:
-      print("User input sufficient")
-      tasks = self.extractSingleJson(cotStep1)
-      print(f"tasks are {tasks}")
-      taskQueries = {}
-      prospectTablesAndColsAll = []
-      for key, task in tasks.items():
-        taskQuery, prospectTablesAndCols = self.getQueryForUserInput(userInput)
-        taskQueries[key] = {"task":task, "taskQuery":taskQuery}
-        prospectTablesAndColsAll.append(prospectTablesAndCols)
-      print(f"tasks and their queries {taskQueries}")
-      combiningSubtasksQueryPrompt = f"""Combine following subtask and their queries to generate sql query to answer the user input.\n """
-      userPrompt = f"user input is {userInput}"
-      for key in taskQueries.keys():
-        task = taskQueries[key]["task"]
-        query = taskQueries[key]["taskQuery"]
-        userPrompt += f" task: {task}, task query: {query}"
-      return self.gptInstance.getResponseForUserInput(userPrompt, combiningSubtasksQueryPrompt), prospectTablesAndColsAll
-    return f"Please rephrase your query. {' '.join(cotStep1.split('Reason')[1:])}", None
-  def getQueryForUserInput(self, userInput, chatHistory=[]):
-    gptSampleRows = self.gptSampleRows
     selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
-    prospectTablesAndCols = self.getProspectiveTablesAndCols(userInput, selectedTablesAndCols, chatHistory)
-    print("getting prospects", prospectTablesAndCols)
-    prospectTablesData = self.filteredSampleDataForProspects(prospectTablesAndCols)
-    systemPromptForQueryGeneration = self.getSystemPromptForQueryGeneration(prospectTablesData, gptSampleRows=gptSampleRows)
-    queryByGpt = self.gptInstance.getResponseForUserInput(userInput, systemPromptForQueryGeneration, chatHistory)
-    queryByGpt = preProcessGptQueryReponse(queryByGpt, metadataLayout=self.metadataLayout)
-    return queryByGpt, prospectTablesAndCols
-  def getProspectiveTablesAndCols(self, userInput, selectedTablesAndCols, chatHistory=[]):
     schemaName = self.schemaName
-    systemPromptForProspectColumns = self.getSystemPromptForProspectColumns(selectedTablesAndCols)
-    prospectiveTablesColsText = self.gptInstance.getResponseForUserInput(userInput, systemPromptForProspectColumns, chatHistory)
-    prospectTablesAndCols = {}
-    for table in selectedTablesAndCols.keys():
-      if table in prospectiveTablesColsText:
-        prospectTablesAndCols[table] = []
-        for column in selectedTablesAndCols[table]:
-          if column in prospectiveTablesColsText:
-            prospectTablesAndCols[table].append(column)
-    return prospectTablesAndCols
-  def getSystemPromptForQueryGeneration(self, prospectTablesData, gptSampleRows):
     schemaName = self.schemaName
     platform = self.platform
     exampleQuery = """SELECT a.customer_id, COUNT(a.product_id) as chandelier_count
 FROM lpdatamart.tbl_f_sales a
 JOIN lpdatamart.tbl_d_product b ON a.product_id = b.product_id
@@ -127,29 +131,9 @@ GROUP BY a.customer_id
 ORDER BY chandelier_count DESC"""
     question = "top 5 customers who bought most chandeliers in nov 2023"
-    prompt = f"""Given an input text, generate the corresponding SQL query for given details. Schema Name is {schemaName}. And sql platform is {platform}.\n following is sample data. Also
-    There is example user input and desired generated sql query. Follow similar patterns as example. eg case insensitive, explicit variable declaration etc. user input : {question}, query : {exampleQuery} """
-    for idx, tableName in enumerate(prospectTablesData.keys(), start=1):
-        prompt += f"table name is {tableName}, table data is {prospectTablesData[tableName].head(gptSampleRows)}"
-    prompt += "XXXX"
-    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")
-  def getSystemPromptForProspectColumns(self, selectedTablesAndCols):
-    schemaName = self.schemaName
-    platform = self.platform
-    prompt = f"""Given an input text, User wants to know which all tables and columns would be possibily to have the desired data. Output them as json. Schema Name is {schemaName}. And sql platform is {platform}.\n"""
-    for idx, tableName in enumerate(selectedTablesAndCols.keys(), start=1):
-        prompt += f"table name {tableName} {', '.join(selectedTablesAndCols[tableName])}"
-    prompt += "XXXX"
-    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")
-  def getSystemPromptTailForCoTStep1(self, selectedTablesAndCols):
-    schemaName = self.schemaName
-    platform = self.platform
-    prompt = f"""schema name is {schemaName}. And sql platform is {platform}. and table info are below.\n"""
-    for idx, tableName in enumerate(selectedTablesAndCols.keys(), start=1):
-        prompt += f"table name {tableName} {', '.join(selectedTablesAndCols[tableName])}"
-    prompt += "XXXX"
-    return prompt.replace("\n"," ").replace("\\"," ").replace("  "," ").replace("XXXX", "    ")

 from gptManager import ChatgptManager
 from utils import *
+import json
+from constants import TABLE_RELATIONS
+class QueryHelper:
+  def __init__(self, gptInstanceForCoT: ChatgptManager,
+               dbEngine, schemaName,
                platform, metadataLayout: MetaDataLayout, sampleDataRows,
+               gptSampleRows, getSampleDataForTablesAndCols, tableSummaryJson='tableSummaryDict.json'):
+    """Store the collaborators and settings, then load the initial sample data.
+
+    Args:
+      gptInstanceForCoT: chat client used for chain-of-thought query generation.
+      dbEngine: database engine handed through to getSampleDataForTablesAndCols.
+      schemaName: schema name interpolated into the prompts.
+      platform: SQL platform name interpolated into the prompts.
+      metadataLayout: object exposing getSelectedTablesAndCols().
+      sampleDataRows: maxRows value passed to getSampleDataForTablesAndCols.
+      gptSampleRows: number of rows per table taken with .head() for prompts.
+      getSampleDataForTablesAndCols: callable that fetches the sample data.
+      tableSummaryJson: source of the table-summary JSON (a path by default).
+    """
+    self.gptInstanceForCoT = gptInstanceForCoT
     self.schemaName = schemaName
     self.platform = platform
     self.metadataLayout = metadataLayout
+    # _onMetadataChange() reads self.sampleDataRows, so it has to be stored
+    # before that call; without it construction fails with AttributeError.
+    self.sampleDataRows = sampleDataRows
     self.gptSampleRows = gptSampleRows
     self.getSampleDataForTablesAndCols = getSampleDataForTablesAndCols
     self.dbEngine = dbEngine
+    self.tableSummaryJson = tableSummaryJson
     self._onMetadataChange()
   def _onMetadataChange(self):
+    """Reload self.sampleData for the tables/columns currently selected.
+
+    Called from __init__ and from updateMetadata().
+    """
+    # Use the layout held by this instance; the bare name `metadataLayout`
+    # is not defined in this method, and updateMetadata() relies on the
+    # instance attribute being the one that is consulted.
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    self.sampleData = self.getSampleDataForTablesAndCols(
+        dbEngine=self.dbEngine,
+        schemaName=self.schemaName,
+        tablesAndCols=selectedTablesAndCols,
+        maxRows=self.sampleDataRows,
+    )
   def getMetadata(self) -> MetaDataLayout :
+    """Return the metadata layout currently held by this helper."""
+    currentLayout = self.metadataLayout
+    return currentLayout
   def updateMetadata(self, metadataLayout):
+    """Swap in a new metadata layout and rebuild self.sampleData from it."""
+    self.metadataLayout = metadataLayout
+    refreshSampleData = self._onMetadataChange
+    refreshSampleData()
   def getQueryForUserInputCoT(self, userInput):
+    """Return the model's response to userInput under the chain-of-thought prompt.
+
+    Args:
+      userInput: the user's natural-language request.
+
+    Returns:
+      Whatever gptInstanceForCoT.getResponseForUserInput returns.
+    """
+    prompt = self.getPromptForCot()
+    # The attribute is spelled gptInstanceForCoT (capital T) in __init__, and
+    # the system prompt must be the generated prompt, not the user's message.
+    self.gptInstanceForCoT.setSystemPrompt(prompt)
+    gptResponse = self.gptInstanceForCoT.getResponseForUserInput(userInput)
+    return gptResponse
+  def getPromptForCot(self):
+    """Build the system prompt used for chain-of-thought SQL generation.
+
+    The prompt is made of a fixed instruction, one worked example (user input
+    plus the expected subquery-structured output), the table description from
+    getSystemPromptForTableCols(), the sample-data description from
+    getSystemPromptForQuery(), and TABLE_RELATIONS.
+
+    Returns:
+      The assembled prompt string.
+    """
+    egUserInput = "I want to get top 5 product categories by state, then rank categories on decreasing order of total sales"
+    # Worked example shown to the model. Relative to the earlier text, the
+    # "description" key is spelled consistently, the missing comma after the
+    # second description is present, and subquery2 selects from subquery1 as
+    # its own description and finalResult state.
+    cotSubtaskOutput = """{
+          "subquery1": {
+            "inputSubquery": [],
+            "description":"calculate the total sales and assigns ranks to product categories within each state based on the descending order of sales in the tbl_f_sales table, utilizing joins with tbl_d_product and tbl_d_customer tables.",
+            "result": "SELECT c.state, b.category, SUM(a.transaction_amount) as total_sales,
+    RANK() OVER(PARTITION BY c.state ORDER BY SUM(a.transaction_amount) DESC) as category_rank
+    FROM lpdatamart.tbl_f_sales a
+    JOIN lpdatamart.tbl_d_product b
+    ON a.product_id = b.product_id
+    JOIN lpdatamart.tbl_d_customer c
+    ON a.customer_id = c.customer_id
+    GROUP BY c.state, b.category "
+          },
+          "subquery2": {
+            "inputSubquery": ["subquery1"],
+            "description":"extracts state, category, and total sales information from a subquery named "subquery1," filtering the results to include only categories with ranks up to 5 and sorting them by state and category rank.",
+            "result":"SELECT state, category, total_sales
+FROM subquery1
+WHERE category_rank <= 5
+ORDER BY state, category_rank"
+          },
+          "finalResult":"WITH subquery1 AS (
+    SELECT c.state, b.category, SUM(a.transaction_amount) as total_sales,
+    RANK() OVER(PARTITION BY c.state ORDER BY SUM(a.transaction_amount) DESC) as category_rank
+    FROM lpdatamart.tbl_f_sales a
+    JOIN lpdatamart.tbl_d_product b
+    ON a.product_id = b.product_id
+    JOIN lpdatamart.tbl_d_customer c
+    ON a.customer_id = c.customer_id
+    GROUP BY c.state, b.category
+)
+SELECT state, category, total_sales
+FROM subquery1
+WHERE category_rank <= 5
+ORDER BY state, category_rank"
+        }"""
+    promptTableInfo = self.getSystemPromptForTableCols()
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    # getSystemPromptForQuery is a method of this class, so it needs `self.`.
+    promptColumnsInfo = self.getSystemPromptForQuery(selectedTablesAndCols)
+    # The example output variable is cotSubtaskOutput; the previously used
+    # name cotSubtaskStructure was never defined.
+    prompt = f"""You are a powerful text to sql model. Your task is to return sql query which answers
+    user's input. Please follow subquery structure if the sql needs to have multiple subqueries.
+    ###example userInput {egUserInput}. output {cotSubtaskOutput}
+    tables information are {promptTableInfo}.
+    columns data are {promptColumnsInfo}.
+    """
+    prompt += f"and table Relations are {TABLE_RELATIONS}"
+    return prompt
+  def getSystemPromptForTableCols(self):
+    """Describe every selected table (summary plus column names) as prompt text.
+
+    Returns:
+      A string listing each selected table with its summary and columns,
+      followed by TABLE_RELATIONS.
+
+    Raises:
+      OSError: if tableSummaryJson is a path that cannot be opened.
+      KeyError: if a selected table has no entry in the summary JSON.
+    """
+    # json.load() needs a readable file object; tableSummaryJson defaults to
+    # a path string, so open it here. An already-open file-like object is
+    # still accepted.
+    summarySource = self.tableSummaryJson
+    if hasattr(summarySource, "read"):
+        tableSummaryDict = json.load(summarySource)
+    else:
+        with open(summarySource, encoding="utf-8") as summaryFile:
+            tableSummaryDict = json.load(summaryFile)
+    selectedTablesAndCols = self.metadataLayout.getSelectedTablesAndCols()
+    promptTableInfo = f"""You are a powerful text to sql model. Answer which tables and columns are needed
+    to answer user input using sql query. and following are tables and columns info. and example user input and result query."""
+    for tableName, columnNames in selectedTablesAndCols.items():
+        promptTableInfo += f"table name {tableName} and summary is {tableSummaryDict[tableName]}"
+        promptTableInfo += f" and columns {', '.join(columnNames)} \n"
+    # NOTE(review): this "XXXX" marker is not replaced anywhere in this
+    # method, so it reaches the prompt literally - confirm that is intended.
+    promptTableInfo += "XXXX"
+    #Join statements
+    promptTableInfo += f"and table Relations are {TABLE_RELATIONS}"
+    return promptTableInfo
+  def getSystemPromptForQuery(self, prospectTablesAndCols):
+    """Build prompt text holding one example query and sample rows per table.
+
+    Args:
+      prospectTablesAndCols: mapping of table name to the columns to show.
+
+    Returns:
+      The prompt string after backslashes and double spaces are replaced by
+      single spaces.
+    """
     schemaName = self.schemaName
     platform = self.platform
+    # The row limit lives on the instance; the bare name gptSampleRows was
+    # never defined in this method.
+    gptSampleRows = self.gptSampleRows
     exampleQuery = """SELECT a.customer_id, COUNT(a.product_id) as chandelier_count
 FROM lpdatamart.tbl_f_sales a
 JOIN lpdatamart.tbl_d_product b ON a.product_id = b.product_id
 ORDER BY chandelier_count DESC"""
     question = "top 5 customers who bought most chandeliers in nov 2023"
+    promptForQuery = f"""You are a powerful text to sql model. Answer user input with sql query. And the query needs to run on {platform}. and schemaName is {schemaName}. There is example user input and desired generated sql query. Follow similar patterns as example. eg case insensitive, explicit variable declaration etc. user input : {question}, query : {exampleQuery}. and table's data is \n"""
+    for tableName, columnNames in prospectTablesAndCols.items():
+        sampleRows = self.sampleData[tableName][columnNames].head(gptSampleRows)
+        promptForQuery += f"table name is {tableName}, table data is {sampleRows}"
+    promptForQuery += f"and table Relations are {TABLE_RELATIONS}"
+    return promptForQuery.replace("\\"," ").replace("  "," ").replace("XXXX", "    ")