"""D^3 - Data Dialogue Decisions.

A small Streamlit app with two active steps:

1. Upload one or more CSV files and preview them.
3. Ask natural-language questions that are answered by a LangChain pandas
   DataFrame agent running over the combined data.

Step 2 (LLM-assisted cleaning) is currently disabled; its draft is kept as a
comment block further down.
"""

import textwrap
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

import pandas as pd
import plotly.express as px
import streamlit as st
from langchain.agents.agent_types import AgentType
from langchain_community.llms import OpenAI
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

# Initialize session state. The tuple is rebuilt on every script run, so each
# new session gets its own fresh containers.
for _key, _default in (
    ("step", 1),
    ("dataframes", {}),
    ("chat_history", []),
    ("cleaning_operations", {}),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default


def main() -> None:
    """Render the page that corresponds to the current wizard step."""
    st.title("D^3- Data Dialogue Decisions")
    if st.session_state.step == 1:
        step_1_upload_and_analyze()
    # elif st.session_state.step == 2:
    #     step_2_clean_data()
    elif st.session_state.step == 3:
        step_3_chat_with_data()


def _go_to_step(step: int) -> None:
    """Button callback: switch the wizard to ``step``.

    Streamlit runs widget callbacks before it re-executes the script, so the
    new step is already in place when ``main`` dispatches on the next run.
    """
    st.session_state.step = step


def step_1_upload_and_analyze() -> None:
    """Step 1: upload CSV files, optionally preview them, then move on.

    Every successfully parsed file is stored in
    ``st.session_state.dataframes`` under its file name. Files that cannot be
    parsed are reported with ``st.error`` and skipped.
    """
    st.subheader("Step 1: Upload and Analyze Data")
    uploaded_files = st.file_uploader(
        "Upload CSV files", type="csv", accept_multiple_files=True
    )
    if uploaded_files:
        for file in uploaded_files:
            try:
                # Rewind first: the same upload may already have been read
                # during an earlier script run.
                file.seek(0)
                df = pd.read_csv(file)
            except (
                pd.errors.EmptyDataError,
                pd.errors.ParserError,
                UnicodeDecodeError,
            ) as exc:
                st.error(f"Could not read {file.name}: {exc}")
                continue
            st.session_state.dataframes[file.name] = df
            st.success(f"Uploaded: {file.name}")

    if st.button("Analyze Data"):
        if not st.session_state.dataframes:
            st.info("Upload at least one CSV file to analyze.")
        for name, df in st.session_state.dataframes.items():
            st.write(f"Analysis for {name}:")
            st.write(f"Shape: {df.shape}")
            st.write("Columns:")
            st.write(df.columns.tolist())
            st.write("Preview:")
            st.write(df.head())
            st.write("---")

    # Use a callback instead of `if st.button(...)`: assigning the step after
    # main() has already dispatched would only take effect on the *next*
    # interaction. The button stays disabled until there is data to explore.
    st.button(
        "Explore Data",
        on_click=_go_to_step,
        args=(3,),
        disabled=not st.session_state.dataframes,
    )


# --- Step 2 (disabled) -------------------------------------------------------
# The draft of the cleaning step is kept for reference; it is not executed.
#
# def step_2_clean_data():
#     st.subheader("Step 2: Clean Data")
#     llm = OpenAI(temperature=0)
#     for name, df in st.session_state.dataframes.items():
#         st.write(f"Cleaning recommendations for {name}:")
#         # Create a summary of the dataframe
#         summary = f"Dataframe '{name}' summary:\n"
#         summary += f"- Shape: {df.shape}\n"
#         summary += f"- Columns: {', '.join(df.columns)}\n"
#         summary += "- Data types:\n"
#         for col, dtype in df.dtypes.items():
#             summary += f" - {col}: {dtype}\n"
#         summary += "- Sample data (first 5 rows):\n"
#         summary += df.head().to_string()
#         # Split the summary into smaller chunks
#         chunk_size = 1500  # Reduced chunk size
#         chunks = textwrap.wrap(summary, chunk_size)
#         cleaning_recommendations = []
#         with st.spinner("Analyzing data and generating recommendations..."):
#             for i, chunk in enumerate(chunks):
#                 chunk_result = analyze_chunk(llm, df, chunk)
#                 cleaning_recommendations.append(chunk_result)
#         # Combine all recommendations
#         full_recommendations = "\n".join(cleaning_recommendations)
#         st.write(full_recommendations)
#         # Create checkboxes for cleaning operations
#         cleaning_ops = [op.strip() for op in full_recommendations.split('\n') if op.strip()]
#         st.session_state.cleaning_operations[name] = []
#         for op in cleaning_ops:
#             if st.checkbox(op, key=f"{name}_{op}"):
#                 st.session_state.cleaning_operations[name].append(op)
#     if st.button("Apply Cleaning and Proceed to Chat"):
#         for name, ops in st.session_state.cleaning_operations.items():
#             df = st.session_state.dataframes[name]
#             for op in ops:
#                 # Here you would implement the actual cleaning operations
#                 # For now, we'll just print what would be done
#                 st.write(f"Applying to {name}: {op}")
#         st.session_state.step = 3
#         st.success("Cleaning operations applied. Proceeding to chat interface.")
#         st.button("Go to Chat Interface")
#     if st.button("Back to Data Upload"):
#         st.session_state.step = 1
#         st.experimental_rerun()
# -----------------------------------------------------------------------------


def step_3_chat_with_data() -> None:
    """Step 3: take a question, answer it, and show the history newest first.

    Each non-empty question is passed to ``process_user_input``; the question
    and the answer are appended to ``st.session_state.chat_history`` as
    ``("User", text)`` and ``("AI", text)`` tuples.
    """
    st.subheader("Step 3: Chat with your data")
    user_input = st.text_input("Ask a question about your data:")
    if user_input:
        response = process_user_input(user_input)
        st.session_state.chat_history.append(("User", user_input))
        st.session_state.chat_history.append(("AI", response))

    history = st.session_state.chat_history
    for index in range(len(history) - 1, -1, -1):
        role, message = history[index]
        if role == "User":
            st.text_area(
                "You:",
                value=message,
                # Recent Streamlit releases reject text areas shorter than
                # 68 px, so 50 would raise there.
                height=68,
                disabled=True,
                # Without an explicit key, asking the same question twice
                # yields two identical widgets and a duplicate-widget error.
                key=f"chat_message_{index}",
            )
        else:
            st.write(message)


def process_user_input(user_input: str) -> str:
    """Answer ``user_input`` with a pandas DataFrame agent.

    All uploaded frames are concatenated into one DataFrame, each row tagged
    with a ``source`` column holding the name of the file it came from. The
    prompt sent to the agent starts with the printed form of that frame.

    Args:
        user_input: The question typed by the user.

    Returns:
        The agent's answer, or a short notice when no data has been uploaded.
    """
    dataframes = st.session_state.dataframes
    if not dataframes:
        # pd.concat raises ValueError on an empty list; answer gracefully.
        return "No data has been uploaded yet. Please upload at least one CSV file first."

    llm = OpenAI(temperature=0)
    combined_df = pd.concat(
        [df.assign(source=name) for name, df in dataframes.items()],
        ignore_index=True,
    )
    # SECURITY: allow_dangerous_code=True lets the agent execute Python that
    # the model generates. Only run this app with trusted users and data.
    agent = create_pandas_dataframe_agent(
        llm,
        combined_df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True,
        handle_parsing_errors=True,
    )
    full_input = f"{combined_df}\nAs a data analyst, process the data to answer the user question.\n\nUser question: {user_input}"
    return agent.run(full_input)


def analyze_chunk(llm, df: pd.DataFrame, chunk: str, timeout: float = 30) -> str:
    """Ask the agent for cleaning suggestions for one chunk of a summary.

    Args:
        llm: Language model handed to the pandas DataFrame agent.
        df: DataFrame the agent operates on.
        chunk: Part of a textual dataframe summary to analyze.
        timeout: Seconds to wait for the agent before giving up.

    Returns:
        The agent's answer, or an error message if the call failed or did not
        finish within ``timeout`` seconds.
    """
    # SECURITY: allow_dangerous_code=True lets the agent execute Python that
    # the model generates.
    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True,
    )
    prompt = (
        "Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. "
        f"Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"
    )
    # Do not use the executor as a context manager: its exit waits for the
    # worker to finish, which would make the timeout below ineffective.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(agent.run, prompt)
        return future.result(timeout=timeout)
    except Exception as e:
        # Deliberately broad: any failure becomes a message for the UI.
        return f"Analysis timed out or encountered an error: {str(e)}"
    finally:
        # Return immediately; a timed-out call finishes in the background.
        executor.shutdown(wait=False)


if __name__ == "__main__":
    main()