import base64
import io
import os

import pandas as pd
import streamlit as st
from lida import Manager, TextGenerationConfig, llm
from lida.datamodel import Goal
from PIL import Image
| |
|
| |
|
| | |
# The upload flow further down writes files under ./data, so create the
# directory up front (no error if it already exists).
os.makedirs("data", exist_ok=True)

# Browser-tab title and icon for the app.
st.set_page_config(page_icon="📊", page_title="KPMG BI")

# Main page heading, then the header of the first sidebar section.
st.write("# KPMG Analytics")
st.sidebar.write("## Setup")
| |
|
| | |
def _mask_key(key):
    """Return ``key`` with all but its first 2 and last 3 characters starred out.

    Keys of five characters or fewer are masked completely: slicing such a
    short key into a 2-character prefix and 3-character suffix would echo the
    whole secret back (and even repeat characters) instead of hiding it.
    """
    if len(key) <= 5:
        return "*" * len(key)
    return key[:2] + "*" * (len(key) - 5) + key[-3:]


# Step 1 - obtain the OpenAI API key: the environment wins, otherwise ask for
# it in the sidebar.
openai_key = os.getenv("OPENAI_API_KEY")

if openai_key:
    display_key = _mask_key(openai_key)
    st.sidebar.write(f"OpenAI API key loaded from environment variable: {display_key}")
else:
    # type="password" keeps the secret from being rendered in clear text while
    # it is typed; strip() drops whitespace that tends to come along with a
    # pasted key.
    openai_key = st.sidebar.text_input("Enter your key:", type="password").strip()
    if openai_key:
        display_key = _mask_key(openai_key)
        st.sidebar.write(f"Current key: {display_key}")
    else:
        st.sidebar.write("Please enter key.")
| |
|
# Markdown horizontal rule ("----") placed under the page heading.
st.markdown("\n\n----\n")
| |
|
| | |
# Step 2 - choose the text generation model, the dataset and the
# summarization method.
#
# Both results are bound before the `if` so that code further down can test
# them even when no API key has been supplied yet.
selected_dataset = None
selected_method = None

if openai_key:
    # --- Text generation settings ---------------------------------------
    st.sidebar.write("## Text Generation Model")
    # "gpt-4" (with the hyphen) is the OpenAI model id; the former "gpt4"
    # entry was the default choice and is not an id the API recognises.
    models = ["gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-16k"]
    selected_model = st.sidebar.selectbox(
        'Choose a model',
        options=models,
        index=0,
    )

    temperature = st.sidebar.slider(
        "Temperature",
        min_value=0.0,
        max_value=1.0,
        value=0.0,
    )

    use_cache = st.sidebar.checkbox("Use cache", value=True)

    # --- Dataset selection / upload ---------------------------------------
    st.sidebar.write("## Data Summarization")
    st.sidebar.write("### Choose a dataset")

    # The first entry is a placeholder; its url of None means "nothing chosen".
    datasets = [
        {"label": "Select a dataset", "url": None},
        {"label": "Cars", "url": "https://raw.githubusercontent.com/uwdata/draco/master/data/cars.csv"},
        {"label": "Weather", "url": "https://raw.githubusercontent.com/uwdata/draco/master/data/weather.json"},
    ]

    selected_dataset_label = st.sidebar.selectbox(
        'Choose a dataset',
        options=[dataset["label"] for dataset in datasets],
        index=0,
    )

    upload_own_data = st.sidebar.checkbox("Upload your own data")

    if upload_own_data:
        uploaded_file = st.sidebar.file_uploader(
            "Choose a CSV or JSON file", type=["csv", "json"])

        if uploaded_file is not None:
            # basename() discards any directory component of the client-supplied
            # name, so the copy below can only ever land inside ./data.
            file_name, file_extension = os.path.splitext(
                os.path.basename(uploaded_file.name))

            # Pick the pandas reader from the (case-insensitive) extension.
            readers = {".csv": pd.read_csv, ".json": pd.read_json}
            reader = readers.get(file_extension.lower())

            data = None
            if reader is None:
                st.sidebar.error(
                    f"Unsupported file type '{file_extension}'. Please upload a CSV or JSON file.")
            else:
                try:
                    data = reader(uploaded_file)
                except ValueError as exc:
                    # pandas reports malformed or empty input through
                    # ValueError subclasses; show it instead of crashing.
                    st.sidebar.error(f"Could not read {uploaded_file.name}: {exc}")

            if data is not None:
                # The frame is serialised with to_csv, so the copy always gets
                # a .csv name. Reusing the uploaded name would leave CSV text
                # in a "*.json" file, which any consumer that picks its parser
                # from the extension (lida appears to) cannot read.
                uploaded_file_path = os.path.join("data", file_name + ".csv")
                data.to_csv(uploaded_file_path, index=False)

                selected_dataset = uploaded_file_path

                datasets.append({"label": file_name, "url": uploaded_file_path})
    else:
        # Map the chosen label back to its url (None for the placeholder).
        dataset_urls = {dataset["label"]: dataset["url"] for dataset in datasets}
        selected_dataset = dataset_urls[selected_dataset_label]

    if not selected_dataset:
        st.info("To continue, select a dataset from the sidebar on the left or upload your own.")

    # --- Summarization method -----------------------------------------------
    st.sidebar.write("### Choose a summarization method")

    summarization_methods = [
        {"label": "llm",
         "description":
         "Uses the AI to generate annotate the default summary, adding details such as semantic types for columns and dataset description"},
        {"label": "default",
         "description": "Uses dataset column statistics and column names as the summary"},
        {"label": "columns", "description": "Uses the dataset column names as the summary"},
    ]

    selected_method_label = st.sidebar.selectbox(
        'Choose a method',
        options=[method["label"] for method in summarization_methods],
        index=0,
    )

    # The label itself is the value later passed as `summary_method`.
    selected_method = selected_method_label

    method_descriptions = {
        method["label"]: method["description"] for method in summarization_methods}
    selected_summary_method_description = method_descriptions[selected_method_label]

    if selected_method:
        st.sidebar.markdown(
            f"<span> {selected_summary_method_description} </span>",
            unsafe_allow_html=True)
| |
|
| | |
def _flatten_summary_fields(fields):
    """Turn the summary's per-column entries into flat rows for a DataFrame.

    Each entry must provide a "column" name and a "properties" mapping. The
    properties are lifted to the top level of the row; the "samples" value is
    converted with ``str()`` (presumably so the sample list shows up as one
    table cell).
    """
    rows = []
    for field in fields:
        row = {"column": field["column"]}
        for name, value in field["properties"].items():
            row[name] = str(value) if name == "samples" else value
        rows.append(row)
    return rows


# Step 3 - generate the data summary once a key, a dataset and a method exist.
if openai_key and selected_dataset and selected_method:
    lida = Manager(text_gen=llm("openai", api_key=openai_key))
    textgen_config = TextGenerationConfig(
        n=1,
        temperature=temperature,
        model=selected_model,
        use_cache=use_cache)

    st.write("## Summary")

    summary = lida.summarize(
        selected_dataset,
        summary_method=selected_method,
        textgen_config=textgen_config)

    if "dataset_description" in summary:
        st.write(summary["dataset_description"])

    if "fields" in summary:
        nfields_df = pd.DataFrame(_flatten_summary_fields(summary["fields"]))
        st.write(nfields_df)
    else:
        st.write(str(summary))

    # Step 4 - generate prompts (goals) and let the user pick one.
    if summary:
        st.sidebar.write("### Prompt Selection")

        num_goals = st.sidebar.slider(
            "Number of Prompts to generate",
            min_value=1,
            max_value=10,
            value=4)
        own_goal = st.sidebar.checkbox("Add Your Own Prompt")

        goals = lida.goals(summary, n=num_goals, textgen_config=textgen_config)
        st.write(f"## Prompts ({len(goals)})")

        goal_questions = [goal.question for goal in goals]

        if own_goal:
            user_goal = st.sidebar.text_input("Describe Your Prompt")

            if user_goal:
                # The user's text serves as both the question and the
                # visualization description of the extra goal.
                new_goal = Goal(question=user_goal, visualization=user_goal, rationale="")
                goals.append(new_goal)
                goal_questions.append(new_goal.question)

        # Stays None when there is nothing to choose from, so an empty result
        # from lida.goals no longer ends in an IndexError.
        selected_goal_object = None
        if goals:
            selected_goal = st.selectbox('Choose a prompt', options=goal_questions, index=0)
            selected_goal_index = goal_questions.index(selected_goal)
            selected_goal_object = goals[selected_goal_index]
            st.write(selected_goal_object)
        else:
            st.warning("No prompts were generated. Add your own prompt from the sidebar to continue.")

        # Step 5 - generate visualizations for the chosen prompt.
        if selected_goal_object:
            st.sidebar.write("## Visualization Library")
            # The list used to start with "" - which is not a library name but
            # was the default selection. "seaborn" takes its place.
            visualization_libraries = ["seaborn", "matplotlib", "plotly"]

            selected_library = st.sidebar.selectbox(
                'Choose a visualization library',
                options=visualization_libraries,
                index=0,
            )

            st.write("## Visualizations")

            num_visualizations = st.sidebar.slider(
                "Number of visualizations to generate",
                min_value=1,
                max_value=10,
                value=2)

            # Same settings as for the summary, but asking for one completion
            # per requested visualization.
            textgen_config = TextGenerationConfig(
                n=num_visualizations,
                temperature=temperature,
                model=selected_model,
                use_cache=use_cache)

            visualizations = lida.visualize(
                summary=summary,
                goal=selected_goal_object,
                textgen_config=textgen_config,
                library=selected_library)

            if not visualizations:
                # With no options the selectbox yields None and the title
                # lookup below would raise ValueError.
                st.warning(
                    "No visualizations could be generated for this prompt. "
                    "Try a different prompt, model or library.")
            else:
                viz_titles = [
                    f'Visualization {number}'
                    for number in range(1, len(visualizations) + 1)]

                selected_viz_title = st.selectbox(
                    'Choose a visualization', options=viz_titles, index=0)

                selected_viz = visualizations[viz_titles.index(selected_viz_title)]

                if selected_viz.raster:
                    # raster is base64 text; decode it and open it as an image.
                    imgdata = base64.b64decode(selected_viz.raster)
                    img = Image.open(io.BytesIO(imgdata))
                    st.image(img, caption=selected_viz_title, use_column_width=True)

                st.write("### Visualization Code")
                st.code(selected_viz.code)
| |
|