Spaces:

arithescientist
/

GenBIChatbot

Sleeping

App Files Files Community

arithescientist committed on Oct 1, 2024

Commit

f0e4f1b

verified ·

1 Parent(s): d0ab6a9

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -183

app.py CHANGED Viewed

@@ -2,16 +2,31 @@ import os
 import streamlit as st
 import pandas as pd
 import sqlite3
-from langchain import OpenAI, LLMChain, PromptTemplate
-import sqlparse
 import logging
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
 # OpenAI API key (ensure it is securely stored)
-# You can set the API key in your environment variables or a .env file
 openai_api_key = os.getenv("OPENAI_API_KEY")
 # Check if the API key is set
@@ -20,7 +35,7 @@ if not openai_api_key:
     st.stop()
 # Step 1: Upload CSV data file (or use default)
-st.title("Natural Language to SQL Query App with Enhanced Insights")
 st.write("Upload a CSV file to get started, or use the default dataset.")
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
@@ -43,117 +58,64 @@ data.to_sql(table_name, conn, index=False, if_exists='replace')
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
-# Step 3: Set up the LLM Chains
-# SQL Generation Chain
-sql_template = """
-You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
-Ensure that:
-- You only use the columns provided.
-- When performing string comparisons in the WHERE clause, make them case-insensitive by using 'COLLATE NOCASE' or the LOWER() function.
-- Do not use 'COLLATE NOCASE' in ORDER BY clauses unless sorting a string column.
-- Do not apply 'COLLATE NOCASE' to numeric columns.
-If the question is vague or open-ended and does not pertain to specific data retrieval, respond with "NO_SQL" to indicate that a SQL query should not be generated.
-Question: {question}
-Table name: {table_name}
-Valid columns: {columns}
-SQL Query:
 """
-sql_prompt = PromptTemplate(template=sql_template, input_variables=['question', 'table_name', 'columns'])
-llm = OpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens = 180)
-sql_generation_chain = LLMChain(llm=llm, prompt=sql_prompt)
-# Insights Generation Chain
-insights_template =  """
-You are an expert data scientist. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
-User's Question: {question}
-SQL Query Result:
-{result}
-Concise Analysis (max 200 words):
-"""
-insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
-insights_chain = LLMChain(llm=llm, prompt=insights_prompt)
-# General Insights and Recommendations Chain
-general_insights_template = """
-You are an expert data scientist. Based on the entire dataset provided below, generate a concise analysis with key insights and recommendations. Limit the response to 150 words.
-Dataset Summary:
-{dataset_summary}
-Concise Analysis and Recommendations (max 150 words):
-"""
-general_insights_prompt = PromptTemplate(template=general_insights_template, input_variables=['dataset_summary'])
-general_insights_chain = LLMChain(llm=llm, prompt=general_insights_prompt)
-# Optional: Clean up function to remove incorrect COLLATE NOCASE usage
-def clean_sql_query(query):
-    """Removes incorrect usage of COLLATE NOCASE from the SQL query."""
-    parsed = sqlparse.parse(query)
-    statements = []
-    for stmt in parsed:
-        tokens = []
-        idx = 0
-        while idx < len(stmt.tokens):
-            token = stmt.tokens[idx]
-            if (token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'COLLATE'):
-                # Check if the next token is 'NOCASE'
-                next_token = stmt.tokens[idx + 2] if idx + 2 < len(stmt.tokens) else None
-                if next_token and next_token.value.upper() == 'NOCASE':
-                    # Skip 'COLLATE' and 'NOCASE' tokens
-                    idx += 3  # Skip 'COLLATE', whitespace, 'NOCASE'
-                    continue
-            tokens.append(token)
-            idx += 1
-        statements.append(''.join([str(t) for t in tokens]))
-    return ' '.join(statements)
-# Function to classify user query
-def classify_query(question):
-    """Classify the user query as either 'SQL' or 'INSIGHTS'."""
-    classification_template = """
-    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
-    Determine the appropriate category for the following user question.
-    Question: "{question}"
-    Category (SQL/INSIGHTS):
-    """
-    classification_prompt = PromptTemplate(template=classification_template, input_variables=['question'])
-    classification_chain = LLMChain(llm=llm, prompt=classification_prompt)
-    category = classification_chain.run({'question': question}).strip().upper()
-    if category.startswith('SQL'):
-        return 'SQL'
-    else:
-        return 'INSIGHTS'
-# Function to generate dataset summary
-def generate_dataset_summary(data):
-    """Generate a summary of the dataset for general insights."""
-    summary_template = """
-    You are an expert data scientist. Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
-    Dataset:
-    {data}
-    Dataset Summary:
-    """
-    summary_prompt = PromptTemplate(template=summary_template, input_variables=['data'])
-    summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
-    summary = summary_chain.run({'data': data.head().to_string(index=False)})
-    return summary
-# Define the callback function
 def process_input():
     user_prompt = st.session_state['user_input']
@@ -162,77 +124,69 @@ def process_input():
             # Append user message to history
             st.session_state.history.append({"role": "user", "content": user_prompt})
-            # Classify the user query
-            category = classify_query(user_prompt)
-            logging.info(f"User query classified as: {category}")
-            if "COLUMNS" in user_prompt.upper():
-                assistant_response = f"The columns are: {', '.join(valid_columns)}"
                 st.session_state.history.append({"role": "assistant", "content": assistant_response})
-            elif category == 'SQL':
-                columns = ', '.join(valid_columns)
-                generated_sql = sql_generation_chain.run({
-                    'question': user_prompt,
-                    'table_name': table_name,
-                    'columns': columns
-                }).strip()
-                if generated_sql.upper() == "NO_SQL":
-                    # Handle cases where no SQL should be generated
-                    assistant_response = "Sure, let's discuss some general insights and recommendations based on the data."
-                    # Generate dataset summary
-                    dataset_summary = generate_dataset_summary(data)
-                    # Generate general insights and recommendations
-                    general_insights = general_insights_chain.run({
-                        'dataset_summary': dataset_summary
-                    })
-                    # Append the assistant's insights to the history
-                    st.session_state.history.append({"role": "assistant", "content": general_insights})
                 else:
-                    # Clean the SQL query
-                    cleaned_sql = clean_sql_query(generated_sql)
-                    logging.info(f"Generated SQL Query: {cleaned_sql}")
-                    # Attempt to execute SQL query and handle exceptions
-                    try:
-                        result = pd.read_sql_query(cleaned_sql, conn)
-                        if result.empty:
-                            assistant_response = "The query returned no results. Please try a different question."
-                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
-                        else:
-                            # Convert the result to a string for the insights prompt
-                            result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
-                            # Generate insights and recommendations based on the query result
-                            insights = insights_chain.run({
-                                'question': user_prompt,
-                                'result': result_str
-                            })
-                            # Append the assistant's insights to the history
-                            st.session_state.history.append({"role": "assistant", "content": insights})
-                            # Append the result DataFrame to the history
-                            st.session_state.history.append({"role": "assistant", "content": result})
-                    except Exception as e:
-                        logging.error(f"An error occurred during SQL execution: {e}")
-                        assistant_response = f"Error executing SQL query: {e}"
-                        st.session_state.history.append({"role": "assistant", "content": assistant_response})
-            else:  # INSIGHTS category
-                # Generate dataset summary
-                dataset_summary = generate_dataset_summary(data)
-                # Generate general insights and recommendations
-                general_insights = general_insights_chain.run({
-                    'dataset_summary': dataset_summary
-                })
-                # Append the assistant's insights to the history
-                st.session_state.history.append({"role": "assistant", "content": general_insights})
         except Exception as e:
             logging.error(f"An error occurred: {e}")
             assistant_response = f"Error: {e}"
@@ -241,7 +195,7 @@ def process_input():
         # Reset the user_input in session state
         st.session_state['user_input'] = ''
-# Display the conversation history
 for message in st.session_state.history:
     if message['role'] == 'user':
         st.markdown(f"**User:** {message['content']}")
@@ -253,4 +207,4 @@ for message in st.session_state.history:
             st.markdown(f"**Assistant:** {message['content']}")
 # Place the input field at the bottom with the callback
-st.text_input("Enter your message:", key='user_input', on_change=process_input)

 import streamlit as st
 import pandas as pd
 import sqlite3
 import logging
+from langchain.agents import create_sql_agent
+from langchain.agents.agent_toolkits import SQLDatabaseToolkit
+from langchain.llms import OpenAI
+from langchain.sql_database import SQLDatabase
+from langchain.prompts import (
+    ChatPromptTemplate,
+    FewShotPromptTemplate,
+    PromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+    MessagesPlaceholder
+)
+from langchain.schema import HumanMessage
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation import load_evaluator
+# Initialize logging
+logging.basicConfig(level=logging.INFO)
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
 # OpenAI API key (ensure it is securely stored)
 openai_api_key = os.getenv("OPENAI_API_KEY")
 # Check if the API key is set
     st.stop()
 # Step 1: Upload CSV data file (or use default)
+st.title("Enhanced Natural Language to SQL Query App")
 st.write("Upload a CSV file to get started, or use the default dataset.")
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
+# Create a SQLDatabase instance restricted to the uploaded table
+engine = SQLDatabase.from_uri(f"sqlite:///{db_file}", include_tables=[table_name])
+# Step 3: Define the few-shot examples for the prompt
+few_shot_examples = [
+    {
+        "input": "What is the total revenue for each category?",
+        "query": f"SELECT category, SUM(revenue) FROM {table_name} GROUP BY category;"
+    },
+    {
+        "input": "Show the top 5 products by sales.",
+        "query": f"SELECT product_name, sales FROM {table_name} ORDER BY sales DESC LIMIT 5;"
+    },
+    {
+        "input": "How many orders were placed in the last month?",
+        "query": f"SELECT COUNT(*) FROM {table_name} WHERE order_date >= DATE('now', '-1 month');"
+    }
+]
+# Step 4: Define the prompt templates
+system_prefix = """
+You are an expert data analyst who can convert natural language questions into SQL queries.
+Follow these guidelines:
+1. Only use the columns and tables provided.
+2. Use appropriate SQL syntax for SQLite.
+3. Ensure string comparisons are case-insensitive.
+4. Do not execute queries that could be harmful or unethical.
+5. Provide clear and concise SQL queries.
 """
+few_shot_prompt = FewShotPromptTemplate(
+    example_prompt=PromptTemplate.from_template("Question: {input}\nSQL Query: {query}"),
+    examples=few_shot_examples,
+    prefix=system_prefix,
+    suffix="Question: {input}\nSQL Query:",
+    input_variables=["input"]
+)
+# Step 5: Initialize the LLM and toolkit
+llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
+toolkit = SQLDatabaseToolkit(db=engine, llm=llm)
+# Step 6: Create the agent
+agent_prompt = ChatPromptTemplate.from_messages([
+    SystemMessagePromptTemplate(prompt=few_shot_prompt),
+    HumanMessagePromptTemplate.from_template("{input}")
+])
+sql_agent = create_sql_agent(
+    llm=llm,
+    toolkit=toolkit,
+    prompt=agent_prompt,
+    verbose=True,
+    agent_type="openai-functions",
+    max_iterations=5
+)
+# Step 7: Define the callback function
 def process_input():
     user_prompt = st.session_state['user_input']
             # Append user message to history
             st.session_state.history.append({"role": "user", "content": user_prompt})
+            # Use the agent to generate the SQL query
+            with st.spinner("Generating SQL query..."):
+                response = sql_agent.run(user_prompt)
+            # Check if the response contains SQL code
+            if "SELECT" in response.upper():
+                sql_query = response.strip()
+                logging.info(f"Generated SQL Query: {sql_query}")
+                # Attempt to execute SQL query and handle exceptions
+                try:
+                    result = pd.read_sql_query(sql_query, conn)
+                    if result.empty:
+                        assistant_response = "The query returned no results. Please try a different question."
+                        st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    else:
+                        # Limit the result to first 10 rows for display
+                        result_display = result.head(10)
+                        st.session_state.history.append({"role": "assistant", "content": "Here are the results:"})
+                        st.session_state.history.append({"role": "assistant", "content": result_display})
+                        # Generate insights based on the query result
+                        insights_template = """
+                        You are an expert data analyst. Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
+                        User's Question: {question}
+                        SQL Query Result:
+                        {result}
+                        Concise Analysis:
+                        """
+                        insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
+                        insights_chain = LLMChain(llm=llm, prompt=insights_prompt)
+                        result_str = result_display.to_string(index=False)
+                        insights = insights_chain.run({'question': user_prompt, 'result': result_str})
+                        # Append the assistant's insights to the history
+                        st.session_state.history.append({"role": "assistant", "content": insights})
+                except Exception as e:
+                    logging.error(f"An error occurred during SQL execution: {e}")
+                    assistant_response = f"Error executing SQL query: {e}"
+                    st.session_state.history.append({"role": "assistant", "content": assistant_response})
+            else:
+                # Handle responses that do not contain SQL queries
+                assistant_response = response
                 st.session_state.history.append({"role": "assistant", "content": assistant_response})
+            # Evaluate the response for harmful content
+            try:
+                evaluator = load_evaluator("harmful_content", llm=llm)
+                eval_result = evaluator.evaluate_strings(
+                    input=user_prompt,
+                    prediction=response
+                )
+                if eval_result['flagged']:
+                    st.warning("The assistant's response may not be appropriate.")
                 else:
+                    logging.info("Response evaluated as appropriate.")
+            except Exception as e:
+                logging.error(f"An error occurred during evaluation: {e}")
         except Exception as e:
             logging.error(f"An error occurred: {e}")
             assistant_response = f"Error: {e}"
         # Reset the user_input in session state
         st.session_state['user_input'] = ''
+# Step 8: Display the conversation history
 for message in st.session_state.history:
     if message['role'] == 'user':
         st.markdown(f"**User:** {message['content']}")
             st.markdown(f"**Assistant:** {message['content']}")
 # Place the input field at the bottom with the callback
+st.text_input("Enter your message:", key='user_input', on_change=process_input)