Spaces:

arithescientist
/

GenBIChatbot

Sleeping

App Files Files Community

Ari commited on Sep 25, 2024

Commit

82bfc51

verified ·

1 Parent(s): 2d80a49

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -66

app.py CHANGED Viewed

@@ -2,90 +2,179 @@ import os
 import streamlit as st
 import pandas as pd
 import sqlite3
-import openai
-from langchain import OpenAI
-from langchain_community.agent_toolkits.sql.base import create_sql_agent
-from langchain_community.utilities import SQLDatabase
-from langchain_community.document_loaders import CSVLoader
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import OpenAIEmbeddings
-from langchain.chains import RetrievalQA
 import sqlparse
 import logging
 # OpenAI API key (ensure it is securely stored)
-openai.api_key = os.getenv("OPENAI_API_KEY")
 # Step 1: Upload CSV data file (or use default)
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 if csv_file is None:
     data = pd.read_csv("default_data.csv")  # Use default CSV if no file is uploaded
-    st.write("Using default data.csv file.")
 else:
     data = pd.read_csv(csv_file)
     st.write(f"Data Preview ({csv_file.name}):")
     st.dataframe(data.head())
-# Step 2: Load CSV data into SQLite database with dynamic table name
-conn = sqlite3.connect(':memory:')  # Use an in-memory SQLite database
 table_name = csv_file.name.split('.')[0] if csv_file else "default_table"
 data.to_sql(table_name, conn, index=False, if_exists='replace')
 # SQL table metadata (for validation and schema)
 valid_columns = list(data.columns)
-# Step 3: Set up the SQL Database for LangChain
-db = SQLDatabase.from_uri('sqlite:///:memory:')
-db.raw_connection = conn  # Use the in-memory connection for LangChain
-# Step 4: Create the SQL agent with the correct parameter name
-sql_agent = create_sql_agent(OpenAI(temperature=0), db=db, verbose=True)
-# Step 5: Use FAISS with RAG for context retrieval
-embeddings = OpenAIEmbeddings()
-loader = CSVLoader(file_path=csv_file.name if csv_file else "default_data.csv")
-documents = loader.load()
-vector_store = FAISS.from_documents(documents, embeddings)
-retriever = vector_store.as_retriever()
-rag_chain = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), retriever=retriever)
-# Step 6: Define SQL validation helpers
-def validate_sql(query, valid_columns):
-    """Validates the SQL query by ensuring it references only valid columns."""
-    for column in valid_columns:
-        if column not in query:
-            return False
-    return True
-def validate_sql_with_sqlparse(query):
-    """Validates SQL syntax using sqlparse."""
-    parsed_query = sqlparse.parse(query)
-    return len(parsed_query) > 0
-# Step 7: Generate SQL query based on user input and run it with LangChain SQL Agent
-user_prompt = st.text_input("Enter your natural language prompt:")
-if user_prompt:
-    try:
-        # Step 8: Retrieve context using RAG
-        context = rag_chain.run(user_prompt)
-        st.write(f"Retrieved Context: {context}")
-        # Step 9: Generate SQL query using SQL agent
-        generated_sql = sql_agent.run(f"{user_prompt} {context}")
-        st.write(f"Generated SQL Query: {generated_sql}")
-        # Step 10: Validate SQL query
-        if not validate_sql_with_sqlparse(generated_sql):
-            st.write("Generated SQL is not valid.")
-        elif not validate_sql(generated_sql, valid_columns):
-            st.write("Generated SQL references invalid columns.")
         else:
-            # Step 11: Execute SQL query
-            result = pd.read_sql(generated_sql, conn)
-            st.write("Query Results:")
-            st.dataframe(result)
-    except Exception as e:
-        logging.error(f"An error occurred: {e}")
-        st.write(f"Error: {e}")

 import streamlit as st
 import pandas as pd
 import sqlite3
+import numpy as np  # For numerical operations
+from langchain import OpenAI, LLMChain, PromptTemplate
 import sqlparse
 import logging
+from sklearn.linear_model import LinearRegression  # For machine learning tasks
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+# Initialize conversation history
+if 'history' not in st.session_state:
+    st.session_state.history = []
+# Set up logging
+logging.basicConfig(level=logging.ERROR)
 # OpenAI API key (ensure it is securely stored)
+openai_api_key = os.getenv("OPENAI_API_KEY")
+# Set OpenAI API key for langchain
+from langchain.llms import OpenAI as LangchainOpenAI
+LangchainOpenAI.api_key = openai_api_key
 # Step 1: Upload CSV data file (or use default)
+st.title("Data Science Chatbot")
 csv_file = st.file_uploader("Upload your CSV file", type=["csv"])
 if csv_file is None:
     data = pd.read_csv("default_data.csv")  # Use default CSV if no file is uploaded
+    st.write("Using default_data.csv file.")
 else:
     data = pd.read_csv(csv_file)
     st.write(f"Data Preview ({csv_file.name}):")
     st.dataframe(data.head())
+# Step 2: Load CSV data into a persistent SQLite database
+db_file = 'my_database.db'
+conn = sqlite3.connect(db_file)
 table_name = csv_file.name.split('.')[0] if csv_file else "default_table"
 data.to_sql(table_name, conn, index=False, if_exists='replace')
 # SQL table metadata (for validation and schema)
 valid_columns = list(data.columns)
+st.write(f"Valid columns: {valid_columns}")
+# Step 3: Define helper functions
+def extract_code(response):
+    """Extracts code enclosed between <CODE> and </CODE> tags."""
+    import re
+    pattern = r"<CODE>(.*?)</CODE>"
+    match = re.search(pattern, response, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    else:
+        return None
+# Step 4: Set up the LLM Chain to generate SQL queries or Python code
+template = """
+You are an expert data scientist assistant. Given a natural language question, the name of the table, and a list of valid columns, decide whether to generate a SQL query to retrieve data, perform statistical analysis, or build a simple machine learning model.
+Instructions:
+- If the question involves data retrieval or simple aggregations, generate a SQL query.
+- If the question requires statistical analysis, generate a Python code snippet using pandas and numpy.
+- If the question involves predictions or modeling, generate a Python code snippet using scikit-learn.
+- Ensure that you only use the columns provided.
+- Do not include any import statements in the code.
+- For case-insensitive string comparisons in SQL, use either 'LOWER(column) = LOWER(value)' or 'column = value COLLATE NOCASE', but do not use both together.
+- Provide the code between <CODE> and </CODE> tags.
+Question: {question}
+Table name: {table_name}
+Valid columns: {columns}
+Response:
+"""
+prompt = PromptTemplate(template=template, input_variables=['question', 'table_name', 'columns'])
+llm = LangchainOpenAI(temperature=0)
+sql_generation_chain = LLMChain(llm=llm, prompt=prompt)
+# Define the callback function
+def process_input():
+    user_prompt = st.session_state['user_input']
+    if user_prompt:
+        try:
+            # Append user message to history
+            st.session_state.history.append({"role": "user", "content": user_prompt})
+            if "columns" in user_prompt.lower():
+                assistant_response = f"The columns are: {', '.join(valid_columns)}"
+                st.session_state.history.append({"role": "assistant", "content": assistant_response})
+            else:
+                columns = ', '.join(valid_columns)
+                response = sql_generation_chain.run({
+                    'question': user_prompt,
+                    'table_name': table_name,
+                    'columns': columns
+                })
+                # Extract code from response
+                code = extract_code(response)
+                if code:
+                    # Determine if the code is SQL or Python
+                    if code.strip().lower().startswith('select'):
+                        # It's a SQL query
+                        st.write(f"Generated SQL Query:\n{code}")
+                        try:
+                            # Execute the SQL query
+                            result = pd.read_sql_query(code, conn)
+                            assistant_response = f"Generated SQL Query:\n{code}"
+                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                            st.session_state.history.append({"role": "assistant", "content": result})
+                        except Exception as e:
+                            logging.error(f"An error occurred during SQL execution: {e}")
+                            assistant_response = f"Error executing SQL query: {e}"
+                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                    else:
+                        # It's Python code
+                        st.write(f"Generated Python Code:\n{code}")
+                        try:
+                            # Prepare the local namespace
+                            local_vars = {
+                                'pd': pd,
+                                'np': np,
+                                'data': data.copy(),
+                                'result': None,
+                                'LinearRegression': LinearRegression,
+                                'train_test_split': train_test_split,
+                                'mean_squared_error': mean_squared_error,
+                                'r2_score': r2_score
+                            }
+                            exec(code, {}, local_vars)
+                            result = local_vars.get('result')
+                            if result is not None:
+                                assistant_response = "Result:"
+                                st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                                st.session_state.history.append({"role": "assistant", "content": result})
+                            else:
+                                assistant_response = "Code executed successfully."
+                                st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                        except Exception as e:
+                            logging.error(f"An error occurred during code execution: {e}")
+                            assistant_response = f"Error executing code: {e}"
+                            st.session_state.history.append({"role": "assistant", "content": assistant_response})
+                else:
+                    assistant_response = response.strip()
+                    st.session_state.history.append({"role": "assistant", "content": assistant_response})
+        except Exception as e:
+            logging.error(f"An error occurred: {e}")
+            assistant_response = f"Error: {e}"
+            st.session_state.history.append({"role": "assistant", "content": assistant_response})
+        # Reset the user_input in session state
+        st.session_state['user_input'] = ''
+# Display the conversation history
+for message in st.session_state.history:
+    if message['role'] == 'user':
+        st.markdown(f"**User:** {message['content']}")
+    elif message['role'] == 'assistant':
+        content = message['content']
+        if isinstance(content, pd.DataFrame):
+            st.markdown("**Assistant:** Here are the results:")
+            st.dataframe(content)
+        elif isinstance(content, (int, float)):
+            st.markdown(f"**Assistant:** {content}")
+        elif isinstance(content, dict):
+            st.markdown("**Assistant:** Here are the results:")
+            st.json(content)
         else:
+            st.markdown(f"**Assistant:** {content}")
+# Place the input field at the bottom with the callback
+st.text_input("Enter your message:", key='user_input', on_change=process_input)