| import streamlit as st |
| import pandas as pd |
| import plotly.express as px |
| from langchain_community.llms import OpenAI |
| from langchain.agents.agent_types import AgentType |
| from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent |
| import textwrap |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
| from functools import partial |
| import time |
|
|
| |
# Seed the per-session state the first time the script runs. Keys that are
# already present are left alone so their values carry over between reruns.
# Factories (rather than literal values) give every session its own fresh
# dict/list instead of a shared one.
for _state_key, _make_default in (
    ("step", lambda: 1),            # which screen main() renders
    ("dataframes", dict),           # file name -> DataFrame
    ("chat_history", list),         # (role, message) tuples
    ("cleaning_operations", dict),  # file name -> selected operations
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
del _state_key, _make_default  # keep the module namespace as it was
|
|
def main():
    """Render the app title and the screen for the current step.

    ``st.session_state.step`` selects the screen: 1 renders the upload and
    analysis screen, 2 the cleaning screen, 3 the chat screen. For any other
    value only the title is rendered.
    """
    st.title("Data Analysis Chat App")

    screens = {
        1: step_1_upload_and_analyze,
        2: step_2_clean_data,
        3: step_3_chat_with_data,
    }
    render_screen = screens.get(st.session_state.step)
    if render_screen is not None:
        render_screen()
|
|
def step_1_upload_and_analyze():
    """Step 1: upload CSV files, optionally preview them, then move to step 2.

    Every uploaded file is parsed with ``pandas.read_csv`` and stored in
    ``st.session_state.dataframes`` under its file name. A file that cannot
    be parsed is reported with ``st.error`` and skipped instead of aborting
    the whole page. The "Analyze Data" button prints shape, column names and
    the first rows of every stored dataframe. "Proceed to Data Cleaning" sets
    the step to 2 and reruns the script so the next screen shows immediately.

    Nothing below the uploader is rendered until at least one file is
    uploaded.
    """
    st.subheader("Step 1: Upload and Analyze Data")

    uploaded_files = st.file_uploader(
        "Upload CSV files", type="csv", accept_multiple_files=True
    )
    if not uploaded_files:
        return

    for file in uploaded_files:
        try:
            # Rewind first: this code runs on every rerun, and a buffer that
            # was already consumed would otherwise parse as empty.
            file.seek(0)
            df = pd.read_csv(file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as exc:
            st.error(f"Could not read {file.name}: {exc}")
            continue
        st.session_state.dataframes[file.name] = df
        st.success(f"Uploaded: {file.name}")

    if st.button("Analyze Data"):
        for name, df in st.session_state.dataframes.items():
            st.write(f"Analysis for {name}:")
            st.write(f"Shape: {df.shape}")
            st.write("Columns:")
            st.write(df.columns.tolist())
            st.write("Preview:")
            st.write(df.head())
            st.write("---")

    if st.button("Proceed to Data Cleaning"):
        st.session_state.step = 2
        # Without a rerun the step-1 screen stays visible until the next
        # unrelated interaction. Prefer st.rerun when it exists and fall back
        # to the older name used elsewhere in this file.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()
|
|
def _summarize_dataframe(name, df):
    """Return the plain-text description of *df* that is sent to the LLM."""
    lines = [
        f"Dataframe '{name}' summary:",
        f"- Shape: {df.shape}",
        # Column labels are not guaranteed to be strings (e.g. integer
        # headers), so convert before joining.
        f"- Columns: {', '.join(map(str, df.columns))}",
        "- Data types:",
    ]
    lines.extend(f" - {col}: {dtype}" for col, dtype in df.dtypes.items())
    lines.append("- Sample data (first 5 rows):")
    lines.append(df.head().to_string())
    return "\n".join(lines)


def step_2_clean_data():
    """Step 2: show LLM cleaning recommendations and let the user pick some.

    For every dataframe in ``st.session_state.dataframes`` a text summary is
    built, split into chunks of at most 1500 characters and each chunk is
    passed to ``analyze_chunk``. The joined result is displayed and every
    non-empty line of it is offered as a checkbox; ticked lines are stored in
    ``st.session_state.cleaning_operations[name]``.

    Recommendations are cached in
    ``st.session_state["cleaning_recommendations"]`` keyed by file name and
    validated against the summary text, so ticking a checkbox (which reruns
    the script) does not trigger new LLM calls. Results containing an
    ``analyze_chunk`` failure message are not cached and are retried on the
    next rerun.

    "Apply Cleaning and Proceed to Chat" lists the selected operations and
    sets the step to 3. "Back to Data Upload" sets the step to 1 and reruns.
    """
    st.subheader("Step 2: Clean Data")

    llm = OpenAI(temperature=0)
    chunk_size = 1500
    # Prefix of the string analyze_chunk returns when it fails or times out.
    error_prefix = "Analysis timed out or encountered an error"

    if "cleaning_recommendations" not in st.session_state:
        st.session_state["cleaning_recommendations"] = {}
    cache = st.session_state["cleaning_recommendations"]

    for name, df in st.session_state.dataframes.items():
        st.write(f"Cleaning recommendations for {name}:")

        summary = _summarize_dataframe(name, df)
        cached = cache.get(name)
        if cached is not None and cached[0] == summary:
            full_recommendations = cached[1]
        else:
            # replace_whitespace=False keeps the newlines, so the sample
            # table stays row-by-row instead of becoming one long line.
            chunks = textwrap.wrap(summary, chunk_size, replace_whitespace=False)
            with st.spinner("Analyzing data and generating recommendations..."):
                results = [analyze_chunk(llm, df, chunk) for chunk in chunks]
            full_recommendations = "\n".join(results)
            if not any(str(r).startswith(error_prefix) for r in results):
                cache[name] = (summary, full_recommendations)

        st.write(full_recommendations)

        cleaning_ops = [
            line.strip()
            for line in full_recommendations.split("\n")
            if line.strip()
        ]
        selected_ops = []
        for index, op in enumerate(cleaning_ops):
            # The index keeps widget keys unique when two lines are identical.
            if st.checkbox(op, key=f"{name}_{index}_{op}"):
                selected_ops.append(op)
        st.session_state.cleaning_operations[name] = selected_ops

    if st.button("Apply Cleaning and Proceed to Chat"):
        for name, ops in st.session_state.cleaning_operations.items():
            for op in ops:
                # NOTE(review): the operations are free-text recommendations;
                # this loop only lists them and does not modify the
                # dataframes.
                st.write(f"Applying to {name}: {op}")

        st.session_state.step = 3
        st.success("Cleaning operations applied. Proceeding to chat interface.")
        # The return value is ignored; the button appears to exist only so
        # the user has something to click once they have read the output.
        st.button("Go to Chat Interface")

    if st.button("Back to Data Upload"):
        st.session_state.step = 1
        # Prefer st.rerun when it exists; fall back to the original call.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()
|
|
def step_3_chat_with_data():
    """Step 3: answer questions about the data and show the conversation.

    A non-empty question typed into the text box is passed to
    ``process_user_input`` and the question/answer pair is appended to
    ``st.session_state.chat_history``. The whole history is then rendered as
    read-only text areas.

    The last answered question is remembered in
    ``st.session_state["last_question"]`` so a rerun with the same text still
    in the box does not call the LLM again or duplicate the history.
    """
    st.subheader("Step 3: Chat with your data")

    user_input = st.text_input("Ask a question about your data:")
    # text_input keeps its value between reruns, so only react to a question
    # that has not been answered yet.
    if user_input and user_input != st.session_state.get("last_question"):
        response = process_user_input(user_input)
        # Recorded only after a successful answer, so a failed call is
        # retried on the next rerun.
        st.session_state["last_question"] = user_input
        st.session_state.chat_history.append(("User", user_input))
        st.session_state.chat_history.append(("AI", response))

    for index, (role, message) in enumerate(st.session_state.chat_history):
        if role == "User":
            # NOTE(review): raised from 50; newer Streamlit releases appear
            # to reject text_area heights below 68 px. Confirm against the
            # pinned version.
            label, height = "You:", 68
        else:
            label, height = "AI:", 100
        # Explicit keys keep widgets distinct when two messages are identical.
        st.text_area(
            label,
            value=message,
            height=height,
            disabled=True,
            key=f"chat_message_{index}",
        )
|
|
def process_user_input(user_input):
    """Answer *user_input* with a pandas dataframe agent over all loaded data.

    All dataframes in ``st.session_state.dataframes`` are concatenated into
    one frame with an extra ``source`` column holding the originating file
    name. A short inventory of the files (row count, column count, column
    names) is prepended to the question before it is sent to the agent.

    Args:
        user_input: The question typed by the user.

    Returns:
        The agent's answer, or an explanatory message when no dataframe has
        been loaded yet.
    """
    dataframes = st.session_state.dataframes
    if not dataframes:
        # pd.concat raises ValueError on an empty list; answer gracefully.
        return "No data is available yet. Please upload at least one CSV file first."

    llm = OpenAI(temperature=0)
    combined_df = pd.concat(
        [df.assign(source=name) for name, df in dataframes.items()],
        ignore_index=True,
    )

    summary_parts = ["Available data:\n"]
    for name, df in dataframes.items():
        summary_parts.append(
            f"- {name}: {len(df)} rows, {len(df.columns)} columns\n"
        )
        # Column labels may be non-strings (e.g. integer headers).
        summary_parts.append(
            f" Columns: {', '.join(map(str, df.columns))}\n\n"
        )
    df_summary = "".join(summary_parts)

    agent = create_pandas_dataframe_agent(
        llm,
        combined_df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        # The agent executes model-generated Python against the dataframe.
        allow_dangerous_code=True,
    )

    full_input = f"{df_summary}\nThe data from all files has been combined into a single DataFrame with an additional 'source' column indicating the original file.\n\nUser question: {user_input}"

    return agent.run(full_input)
|
|
def analyze_chunk(llm, df, chunk, timeout=30):
    """Ask a pandas dataframe agent for cleaning suggestions for one chunk.

    Args:
        llm: Language model handed to ``create_pandas_dataframe_agent``.
        df: Dataframe the agent operates on.
        chunk: Part of the dataframe summary text to include in the prompt.
        timeout: Maximum number of seconds to wait for the agent's answer.

    Returns:
        The agent's answer, or a string starting with
        "Analysis timed out or encountered an error" when the agent raised
        or did not finish within *timeout* seconds.
    """
    # Local import: the file only imports ThreadPoolExecutor and as_completed.
    # Before Python 3.11 this class is distinct from the built-in TimeoutError.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        # The agent executes model-generated Python against the dataframe.
        allow_dangerous_code=True,
    )

    prompt = f"Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"

    # The executor is deliberately not used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which would block until the
    # agent finishes and make the timeout ineffective.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(agent.run, prompt)
        return future.result(timeout=timeout)
    except FutureTimeoutError:
        # str() of a timeout exception is empty, so say what happened.
        return f"Analysis timed out or encountered an error: no result within {timeout} seconds"
    except Exception as e:
        # Best effort by design: the failure is reported as the result text.
        return f"Analysis timed out or encountered an error: {str(e)}"
    finally:
        # Do not wait for a still-running agent call; the worker thread
        # cannot be interrupted and finishes in the background.
        executor.shutdown(wait=False)
|
|
# Entry point: render the app when this file is executed as a script.
if __name__ == "__main__":
    main()