Spaces:

azizalto
/

AutoPySQLify-NYC-hackathon

Sleeping

App Files Files Community

Aziz Alto committed on Apr 24, 2023

Commit

7c72cf3

1 Parent(s): 51c318e

Suggest customized questions for any dataset 🔥 powered by GPT

Browse files

Files changed (1) hide show

app.py +143 -63

app.py CHANGED Viewed

@@ -6,12 +6,12 @@ import pandas as pd
 import streamlit as st
 import streamlit_ace as stace
 import duckdb
-import numpy as np # for user session
-import scipy # for user session
 import plotly_express
-import plotly.express as px # for user session
-import plotly.figure_factory as ff # for user session
-import matplotlib.pyplot as plt # for user session
 import sklearn
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
@@ -24,7 +24,16 @@ header = """
 > `GPT-powered` and `Jupyter notebook-inspired`
 """
 st.markdown(header, unsafe_allow_html=True)
-st.markdown("> <sub>[NYC AI Hackathon](https://tech.cornell.edu/events/nyc-gpt-llm-hackathon/) April, 23 2023</sub>", unsafe_allow_html=True)
 if "OPENAI_API_KEY" not in os.environ:
     os.environ["OPENAI_API_KEY"] = st.text_input("OpenAI API Key", type="password")
@@ -34,6 +43,7 @@ p = st.write
 print = st.write
 display = st.write
 @st.cache_data
 def _read_csv(f, **kwargs):
     df = pd.read_csv(f, on_bad_lines="skip", **kwargs)
@@ -45,8 +55,9 @@ def _read_csv(f, **kwargs):
 def timer(func):
     def wrapper_function(*args, **kwargs):
         start_time = time.time()
-        func(*args,  **kwargs)
         st.write(f"`{(time.time() - start_time):.2f}s.`")
     return wrapper_function
@@ -59,7 +70,7 @@ SAMPLE_DATA = {
     "Country Table": "https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv",
     "World Cities": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/cities.csv",
     "World States": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/states.csv",
-    "World Countries": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/countries.csv"
 }
@@ -78,7 +89,9 @@ def read_data():
             if url:
                 file_ = url
         with col3:
-            selected = st.selectbox("Select a sample dataset", options=[""] + list(SAMPLE_DATA))
             if selected:
                 file_ = SAMPLE_DATA[selected]
@@ -122,12 +135,28 @@ def code_editor(language, hint, show_panel, key=None, content=None):
         _KEYBINDINGS = stace.KEYBINDINGS
         col21, col22 = st.columns(2)
         with col21:
-            theme = st.selectbox("Theme", options=[default_theme] + _THEMES, key=f"{language}1{key}")
-            tab_size = st.slider("Tab size", min_value=1, max_value=8, value=4, key=f"{language}2{key}")
         with col22:
-            keybinding = st.selectbox("Keybinding", options=[_KEYBINDINGS[-2]] + _KEYBINDINGS, key=f"{language}3{key}")
-            font_size = st.slider("Font size", min_value=5, max_value=24, value=14, key=f"{language}4{key}")
-        height = st.slider("Editor height", value=130, max_value=777,key=f"{language}5{key}")
         # kwargs = {theme: theme, keybinding: keybinding} # TODO: DRY
     if not show_panel:
         placeholder.empty()
@@ -143,7 +172,7 @@ def code_editor(language, hint, show_panel, key=None, content=None):
         theme=theme,
         font_size=font_size,
         tab_size=tab_size,
-        key=key
     )
     # Display editor's content as you type
@@ -167,13 +196,7 @@ def download(df, key, save_as="results.csv"):
         return _df.to_csv().encode("utf-8")
     csv = convert_df(df)
-    st.download_button(
-        "Download",
-        csv,
-        save_as,
-        "text/csv",
-        key=key
-    )
 def display_results(query: str, result: pd.DataFrame, key: str):
@@ -186,7 +209,7 @@ def display_results(query: str, result: pd.DataFrame, key: str):
 def run_python_script(user_script, key):
     if user_script.startswith("st.") or ";" in user_script:
         py = user_script
-    elif user_script.endswith("?"): # -- same as ? in Jupyter Notebook
         in_ = user_script.replace("?", "")
         py = f"st.help({in_})"
     else:
@@ -278,7 +301,7 @@ def display_example_snippets():
 class GPTWrapper:
-    def __init__(self):#, df_info):
         from gpt import AnthropicSerivce, OpenAIService
@@ -289,6 +312,7 @@ class GPTWrapper:
     @st.cache_data
     def ask_sql(df_info, question):
         from gpt import OpenAIService
         openai_model = OpenAIService()
         prompt = GPTWrapper().build_sql_prompt(df_info, question)
         res = openai_model.prompt(prompt)
@@ -298,18 +322,19 @@ class GPTWrapper:
     @st.cache_data
     def ask_python(df_info, question):
         from gpt import OpenAIService
         openai_model = OpenAIService()
         prompt = GPTWrapper().build_python_prompt(df_info, question)
         res = openai_model.prompt(prompt)
         return res, prompt
     @staticmethod
     @st.cache_data
     def build_sql_prompt(df_info, question):
         prompt = f"""I have data in a pandas dataframe, here is the data schema: {df_info}
         Next, I will ask you a question. Assume the table name is `df`.
-        And you will answer in writing a SQL query only. {question}
         """
         return prompt
@@ -317,30 +342,49 @@ class GPTWrapper:
     @st.cache_data
     def build_python_prompt(df_info, question):
         prompt = f"""I have data in a pandas dataframe, here is the dataframe schema: {df_info}
-        Next, I will ask you a question. And you will answer in writing a Python code only.
-        Assume the data is stored in a variable named `df`.
-        Here are some instructions for the generated Python code:
-        - You should always use the variable `df` to refer to the dataframe.
-        - You should not include any markdown syntax or any other syntax that is not Python in the answer.
         - Import any required libraries in the first line of the generated code.
-        - Just show the Python code only, don't include any Python comments or English explanation in the answer text.
-        - If the generarted code has multiple Python lines, every Python line must end with a semicolon (;).
-        - If the answer is not a plot or a figure, always use print to print the answer using print().
-        - If the answer requires plotting, generate a plot using plotly_express and show it using st.plotly_chart(fig).
         Here is the question: {question}
         """
         return prompt
 def ask_gpt_sql(df_info, key):
     # -- GPT AI
     # agi = GPTWrapper(df_info=df_info)
-    question = st.text_input("Ask a question about the dataset to get a SQL query that answers the question",
-                             placeholder="How many rows are there in the dataset?",
-                             key=key
-                             )
     if question:
         # res, prompt = agi.ask_sql(df_info, question)
         res, prompt = GPTWrapper().ask_sql(df_info, question)
@@ -349,22 +393,34 @@ def ask_gpt_sql(df_info, key):
         st.code(sql_query, language="sql")
         return sql_query
 def ask_gpt_python(df_info, key):
     # -- GPT AI
-    # agi = GPTWrapper(df_info=df_info)
-    question = st.text_input("Ask a question about the dataset to get a Python code that answers the question",
-                             placeholder="How many rows and columns are there in the dataset?",
-                             key=key
-                             )
     if question:
-        # res, prompt = agi.ask_python(df_info, question)
         res, prompt = GPTWrapper().ask_python(df_info, question)
-        # st.markdown(f"```{prompt}```")
         python_code = res.choices[0].message.content
         st.code(python_code, language="python")
-        # st.markdown(f"```{python_code}```", unsafe_allow_html=True)
         return python_code
 if __name__ == "__main__":
     show_examples = docs()
@@ -381,8 +437,6 @@ if __name__ == "__main__":
     df.info(buf=sio)
     df_info = sio.getvalue()
     # st.markdown(f"```{df_info}```", unsafe_allow_html=True)
     # run and execute SQL script
     def sql_cells(df):
@@ -394,24 +448,32 @@ if __name__ == "__main__":
         Describe the table:
             DESCRIBE TABLE df
         """
-        number_cells = st.sidebar.number_input("Number of SQL cells to use", value=1, max_value=40)
         for i in range(number_cells):
             key = f"sql{i}"
             col1, col2 = st.columns([2, 1])
             st.markdown("<br>", unsafe_allow_html=True)
-            show_panel = False #col2.checkbox("Show cell config panel", key=f"{i}-sql")
             col1.write(f"> `IN[{i+1}]`")
-            # with col2:
             # -- GPT AI
             query = ask_gpt_sql(df_info, key=f"{key}-gpt")
             content = None
-            if query and st.button("Use SQL", key=f"{key}-use-sql"):
                 content = query
-            # with col1:
-            sql = code_editor("sql", hint, show_panel=show_panel, key=key, content=content if content else None)
             if sql:
                 st.code(sql, language="sql")
                 st.write(f"`OUT[{i+1}]`")
@@ -451,27 +513,42 @@ if __name__ == "__main__":
             st.bar_chart(groups[i].mean())
         ```
         """
-        number_cells = st.sidebar.number_input("Number of Python cells to use", value=1, max_value=40, min_value=1, help=help)
         for i in range(number_cells):
             # st.markdown("<br><br><br>", unsafe_allow_html=True)
             col1, col2 = st.columns([2, 1])
             # col1.write(f"> `IN[{i+1}]`")
-            show_panel = False # col2.checkbox("Show cell config panel", key=f"panel{i}")
             # -- GPT AI
             query = ask_gpt_python(df_info, key=f"{i}-gpt")
             content = None
-            if query and st.checkbox("Use generated code", key=f"{i}-use-python"):
                 content = query
-            user_script = code_editor("python", hint, show_panel=show_panel, key=i, content=content if content else None)
             if user_script:
-                df.rename(columns={"lng": "lon"}, inplace=True) # hot-fix for "World Population" dataset
                 st.write(f"> `IN[{i+1}]`")
                 st.code(user_script, language="python")
                 st.write(f"> `OUT[{i+1}]`")
                 run_python_script(user_script, key=f"{user_script}{i}")
     if st.sidebar.checkbox("Show SQL cells", value=True):
         sql_cells(df)
     if st.sidebar.checkbox("Show Python cells", value=True):
@@ -479,7 +556,10 @@ if __name__ == "__main__":
     st.sidebar.write("---")
-    if st.sidebar.checkbox("Generate Data Profile Report", help="pandas profiling, generated by [ydata-profiling](https://github.com/ydataai/ydata-profiling)"):
         st.write("---")
         st.header("Data Profiling")
         profile = data_profiler(df)

 import streamlit as st
 import streamlit_ace as stace
 import duckdb
+import numpy as np  # for user session
+import scipy  # for user session
 import plotly_express
+import plotly.express as px  # for user session
+import plotly.figure_factory as ff  # for user session
+import matplotlib.pyplot as plt  # for user session
 import sklearn
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 > `GPT-powered` and `Jupyter notebook-inspired`
 """
 st.markdown(header, unsafe_allow_html=True)
+st.markdown(
+    "> <sub>[NYC AI Hackathon](https://tech.cornell.edu/events/nyc-gpt-llm-hackathon/) April, 23 2023</sub>",
+    unsafe_allow_html=True,
+)
+if "ANTHROPIC_API_KEY" not in os.environ:
+    os.environ["ANTHROPIC_API_KEY"] = st.text_input(
+        "Anthropic API Key", type="password"
+    )
 if "OPENAI_API_KEY" not in os.environ:
     os.environ["OPENAI_API_KEY"] = st.text_input("OpenAI API Key", type="password")
 print = st.write
 display = st.write
 @st.cache_data
 def _read_csv(f, **kwargs):
     df = pd.read_csv(f, on_bad_lines="skip", **kwargs)
 def timer(func):
     """Decorate ``func`` so each call writes its elapsed wall-clock time.

     The wrapper forwards all positional and keyword arguments to ``func``,
     writes the elapsed seconds (two decimals) with ``st.write`` once ``func``
     has finished, and returns whatever ``func`` returned. ``functools.wraps``
     keeps ``func``'s name and docstring on the wrapper.
     """
     import functools  # imported locally so the block needs no file-level change

     @functools.wraps(func)
     def wrapper_function(*args, **kwargs):
         start_time = time.time()
         result = func(*args, **kwargs)
         st.write(f"`{(time.time() - start_time):.2f}s.`")
         # Hand the wrapped function's result back instead of dropping it.
         return result

     return wrapper_function
     "Country Table": "https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv",
     "World Cities": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/cities.csv",
     "World States": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/states.csv",
+    "World Countries": "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv/countries.csv",
 }
             if url:
                 file_ = url
         with col3:
+            selected = st.selectbox(
+                "Select a sample dataset", options=[""] + list(SAMPLE_DATA)
+            )
             if selected:
                 file_ = SAMPLE_DATA[selected]
         _KEYBINDINGS = stace.KEYBINDINGS
         col21, col22 = st.columns(2)
         with col21:
+            theme = st.selectbox(
+                "Theme", options=[default_theme] + _THEMES, key=f"{language}1{key}"
+            )
+            tab_size = st.slider(
+                "Tab size", min_value=1, max_value=8, value=4, key=f"{language}2{key}"
+            )
         with col22:
+            keybinding = st.selectbox(
+                "Keybinding",
+                options=[_KEYBINDINGS[-2]] + _KEYBINDINGS,
+                key=f"{language}3{key}",
+            )
+            font_size = st.slider(
+                "Font size",
+                min_value=5,
+                max_value=24,
+                value=14,
+                key=f"{language}4{key}",
+            )
+        height = st.slider(
+            "Editor height", value=130, max_value=777, key=f"{language}5{key}"
+        )
         # kwargs = {theme: theme, keybinding: keybinding} # TODO: DRY
     if not show_panel:
         placeholder.empty()
         theme=theme,
         font_size=font_size,
         tab_size=tab_size,
+        key=key,
     )
     # Display editor's content as you type
         return _df.to_csv().encode("utf-8")
     csv = convert_df(df)
+    st.download_button("Download", csv, save_as, "text/csv", key=key)
 def display_results(query: str, result: pd.DataFrame, key: str):
 def run_python_script(user_script, key):
     if user_script.startswith("st.") or ";" in user_script:
         py = user_script
+    elif user_script.endswith("?"):  # -- same as ? in Jupyter Notebook
         in_ = user_script.replace("?", "")
         py = f"st.help({in_})"
     else:
 class GPTWrapper:
+    def __init__(self):  # , df_info):
         from gpt import AnthropicSerivce, OpenAIService
     @st.cache_data
     def ask_sql(df_info, question):
         from gpt import OpenAIService
         openai_model = OpenAIService()
         prompt = GPTWrapper().build_sql_prompt(df_info, question)
         res = openai_model.prompt(prompt)
     @st.cache_data
     def ask_python(df_info, question):
         """Send the Python-code prompt for ``question`` to the OpenAI service.

         Returns a ``(response, prompt)`` tuple: the object returned by
         ``OpenAIService.prompt`` and the prompt text that was sent.
         """
         from gpt import OpenAIService

         service = OpenAIService()
         python_prompt = GPTWrapper().build_python_prompt(df_info, question)
         response = service.prompt(python_prompt)
         return response, python_prompt
     @staticmethod
     @st.cache_data
     def build_sql_prompt(df_info, question):
         """Build the prompt asking the model for a SQL query over table ``df``.

         ``df_info`` is interpolated as the data schema and ``question`` as the
         user's question. Returns the formatted prompt string.
         """
         prompt = f"""I have data in a pandas dataframe, here is the data schema: {df_info}
         Next, I will ask you a question. Assume the table name is `df`.
         And you will answer in writing a SQL query only by using the table `df` and schema above.
         Here is the question: {question}.
         """
         return prompt
     @st.cache_data
     def build_python_prompt(df_info, question):
         """Build the prompt asking the model for Python code that uses ``df``.

         ``df_info`` is interpolated as the dataframe schema and ``question`` as
         the user's question. The instruction list tells the model to end every
         line with ``;`` and to print or plot the answer. Returns the formatted
         prompt string.
         """
         prompt = f"""I have data in a pandas dataframe, here is the dataframe schema: {df_info}
         Next, I will ask you a question. Assume the data is stored in a variable named `df`.
         And you will answer in writing a Python code only by using the variable `df` and schema above.
         Here are some instructions you must follow when writing the code:
         - The answer must be Python code only.
         - The code must include column names from the dataframe schema above only.
         - Import any required libraries in the first line of the generated code.
         - Use `df` as the variable name for the dataframe.
         - Don't include any comments in the code.
         - Every line of code must end with `;`.
         - For non-plotting answers, you must use `print()` to print the answer.
         - For plotting answers, one of the following options must be used:
             - `st.pyplot(fig)` to display the plot in the Streamlit app.
             - plotly_express to generate a plot and `st.plotly_chart()` to show it.
         Here is the question: {question}
         """
         return prompt
+    @staticmethod
+    @st.cache_data
+    def suggest_questions(df_info, language):
+        prompt = f"""
+        {df_info}
+        What questions (exploratory or explanatory) can be asked about this dataset to analyze the data as a whole using {language}? Be as specific as possible based on the data schema above.
+        """
+        from gpt import OpenAIService
+        openai_model = OpenAIService()
+        res = openai_model.prompt(prompt)
+        return res, prompt
 def ask_gpt_sql(df_info, key):
     # -- GPT AI
     # agi = GPTWrapper(df_info=df_info)
+    question = st.text_input(
+        "Ask a question about the dataset to get a SQL query that answers the question",
+        placeholder="How many rows are there in the dataset?",
+        key=key,
+    )
     if question:
         # res, prompt = agi.ask_sql(df_info, question)
         res, prompt = GPTWrapper().ask_sql(df_info, question)
         st.code(sql_query, language="sql")
         return sql_query
+    with st.expander("Example questions"):
+        res, prompt = GPTWrapper().suggest_questions(df_info, "SQL")
+        suggestions = res.choices[0].message.content
+        st.markdown("Here are some example questions:")
+        st.markdown(f"```{suggestions}```", unsafe_allow_html=True)
 def ask_gpt_python(df_info, key):
     """Render the Python question box and return the generated code, if any.

     ``df_info`` is the dataframe schema text handed to the prompt builders and
     ``key`` is the Streamlit widget key of the text input.

     When a question has been entered, the model's answer is shown with
     ``st.code`` and returned. Otherwise an "Example questions" expander with
     model-suggested questions is rendered and ``None`` is returned.
     """
     # -- GPT AI
     question = st.text_input(
         "Ask a question about the dataset to get a Python code that answers the question",
         placeholder="How many rows and columns are there in the dataset?",
         key=key,
     )
     if question:
         res, _prompt = GPTWrapper().ask_python(df_info, question)
         python_code = res.choices[0].message.content
         st.code(python_code, language="python")
         return python_code

     # No question yet: offer suggestions instead.
     with st.expander("Example questions"):
         res, _prompt = GPTWrapper().suggest_questions(df_info, "Python")
         suggestions = res.choices[0].message.content
         st.markdown("Here are some example questions:")
         # The suggestions are model output, not trusted markup, so they are
         # rendered as plain markdown without unsafe_allow_html.
         st.markdown(suggestions)
     return None
 if __name__ == "__main__":
     show_examples = docs()
     df.info(buf=sio)
     df_info = sio.getvalue()
     # st.markdown(f"```{df_info}```", unsafe_allow_html=True)
     # run and execute SQL script
     def sql_cells(df):
         Describe the table:
             DESCRIBE TABLE df
         """
+        number_cells = st.sidebar.number_input(
+            "Number of SQL cells to use", value=1, max_value=40
+        )
         for i in range(number_cells):
             key = f"sql{i}"
             col1, col2 = st.columns([2, 1])
             st.markdown("<br>", unsafe_allow_html=True)
+            show_panel = (
+                False  # col2.checkbox("Show cell config panel", key=f"{i}-sql")
+            )
             col1.write(f"> `IN[{i+1}]`")
             # -- GPT AI
             query = ask_gpt_sql(df_info, key=f"{key}-gpt")
             content = None
+            if query and st.button("Run the generated code", key=f"{key}-use-sql"):
                 content = query
+            sql = code_editor(
+                "sql",
+                hint,
+                show_panel=show_panel,
+                key=key,
+                content=content if content else None,
+            )
             if sql:
                 st.code(sql, language="sql")
                 st.write(f"`OUT[{i+1}]`")
             st.bar_chart(groups[i].mean())
         ```
         """
+        number_cells = st.sidebar.number_input(
+            "Number of Python cells to use",
+            value=1,
+            max_value=40,
+            min_value=1,
+            help=help,
+        )
         for i in range(number_cells):
             # st.markdown("<br><br><br>", unsafe_allow_html=True)
             col1, col2 = st.columns([2, 1])
             # col1.write(f"> `IN[{i+1}]`")
+            show_panel = (
+                False  # col2.checkbox("Show cell config panel", key=f"panel{i}")
+            )
             # -- GPT AI
             query = ask_gpt_python(df_info, key=f"{i}-gpt")
             content = None
+            if query and st.checkbox("Run the generated code", key=f"{i}-use-python"):
                 content = query
+            user_script = code_editor(
+                "python",
+                hint,
+                show_panel=show_panel,
+                key=i,
+                content=content if content else None,
+            )
             if user_script:
+                df.rename(
+                    columns={"lng": "lon"}, inplace=True
+                )  # hot-fix for "World Population" dataset
                 st.write(f"> `IN[{i+1}]`")
                 st.code(user_script, language="python")
                 st.write(f"> `OUT[{i+1}]`")
                 run_python_script(user_script, key=f"{user_script}{i}")
     if st.sidebar.checkbox("Show SQL cells", value=True):
         sql_cells(df)
     if st.sidebar.checkbox("Show Python cells", value=True):
     st.sidebar.write("---")
+    if st.sidebar.checkbox(
+        "Generate Data Profile Report",
+        help="pandas profiling, generated by [ydata-profiling](https://github.com/ydataai/ydata-profiling)",
+    ):
         st.write("---")
         st.header("Data Profiling")
         profile = data_profiler(df)