Spaces:

chirpy123
/

DDD

Runtime error

App Files Files Community

DDD / app.py

chirpy123

Add application file

d530b89 over 1 year ago

raw

history blame contribute delete

6.32 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	from langchain_community.llms import OpenAI
	from langchain.agents.agent_types import AgentType
	from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
	import textwrap
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from functools import partial
	import time

	# Seed the per-session state on the first script run. On later reruns the
	# keys already exist, so the stored values are left untouched.
	for _state_key, _initial_value in (
	    ("step", 1),
	    ("dataframes", {}),
	    ("chat_history", []),
	    ("cleaning_operations", {}),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _initial_value
	# Keep the module namespace free of the loop's temporaries.
	del _state_key, _initial_value

	def main():
	    """Render the page title and the screen for the current wizard step.

	    Reads ``st.session_state.step`` and delegates to the matching step
	    function. Any value other than 1, 2 or 3 renders only the title.
	    """
	    st.title("Data Analysis Chat App")

	    # Map each step number to the function that draws its screen.
	    step_renderers = {
	        1: step_1_upload_and_analyze,
	        2: step_2_clean_data,
	        3: step_3_chat_with_data,
	    }
	    render_step = step_renderers.get(st.session_state.step)
	    if render_step is not None:
	        render_step()

	def step_1_upload_and_analyze():
	    """Render step 1: upload CSV files, preview them, and move on to cleaning.

	    Each uploaded file is parsed with ``pd.read_csv`` and stored in
	    ``st.session_state.dataframes`` under its file name. A file that cannot
	    be parsed is reported with ``st.error`` and skipped instead of aborting
	    the whole page.

	    "Analyze Data" writes the shape, column list and first rows of every
	    stored dataframe. "Proceed to Data Cleaning" sets
	    ``st.session_state.step`` to 2 and reruns the script so that step 2 is
	    drawn immediately; it refuses to advance while no dataframe is stored.
	    """
	    st.subheader("Step 1: Upload and Analyze Data")

	    uploaded_files = st.file_uploader(
	        "Upload CSV files", type="csv", accept_multiple_files=True
	    )
	    for file in uploaded_files or []:
	        try:
	            # Rewind first in case the buffer was consumed by an earlier run.
	            file.seek(0)
	            df = pd.read_csv(file)
	        except (
	            pd.errors.EmptyDataError,
	            pd.errors.ParserError,
	            UnicodeDecodeError,
	        ) as exc:
	            st.error(f"Could not read {file.name}: {exc}")
	            continue
	        st.session_state.dataframes[file.name] = df
	        st.success(f"Uploaded: {file.name}")

	    if st.button("Analyze Data"):
	        for name, df in st.session_state.dataframes.items():
	            st.write(f"Analysis for {name}:")
	            st.write(f"Shape: {df.shape}")
	            st.write("Columns:")
	            st.write(df.columns.tolist())
	            st.write("Preview:")
	            st.write(df.head())
	            st.write("---")

	    if st.button("Proceed to Data Cleaning"):
	        if not st.session_state.dataframes:
	            # Later steps need at least one dataframe (the chat step
	            # concatenates them), so do not advance with nothing loaded.
	            st.warning("Please upload at least one CSV file before proceeding.")
	        else:
	            st.session_state.step = 2
	            # The step dispatch has already happened for this run, so a rerun
	            # is required for the new step to appear. Prefer st.rerun and fall
	            # back to the older st.experimental_rerun name.
	            rerun = getattr(st, "rerun", None) or st.experimental_rerun
	            rerun()

	def step_2_clean_data():
	    """Render step 2: show LLM cleaning recommendations and let the user pick.

	    For every dataframe in ``st.session_state.dataframes`` a text summary is
	    built and sent, chunk by chunk, to ``analyze_chunk``. The combined answer
	    is shown and every non-empty line becomes a checkbox; the ticked lines
	    are stored in ``st.session_state.cleaning_operations[name]``.

	    Recommendations are cached in ``st.session_state`` together with the
	    summary they were generated from. Streamlit reruns the script on every
	    widget interaction, so without the cache each checkbox click would
	    trigger a new round of LLM calls. The cache is cleared when the user
	    goes back to the upload step.

	    "Apply Cleaning and Proceed to Chat" only lists the selected operations
	    (no cleaning is implemented yet) and sets the step to 3.
	    """
	    st.subheader("Step 2: Clean Data")

	    if "cleaning_recommendations" not in st.session_state:
	        st.session_state["cleaning_recommendations"] = {}
	    # Maps dataframe name -> (summary text, recommendations text).
	    cache = st.session_state["cleaning_recommendations"]

	    llm = None
	    for name, df in st.session_state.dataframes.items():
	        st.write(f"Cleaning recommendations for {name}:")

	        summary = _summarize_dataframe(name, df)
	        cached = cache.get(name)
	        # Regenerate only when there is no entry yet or the data changed.
	        if cached is None or cached[0] != summary:
	            if llm is None:
	                # Built lazily so a fully cached rerun needs no client.
	                llm = OpenAI(temperature=0)
	            with st.spinner("Analyzing data and generating recommendations..."):
	                cached = (summary, _recommend_cleaning(llm, df, summary))
	            cache[name] = cached

	        full_recommendations = cached[1]
	        st.write(full_recommendations)

	        # Create checkboxes for cleaning operations
	        cleaning_ops = [
	            op.strip() for op in full_recommendations.split('\n') if op.strip()
	        ]
	        st.session_state.cleaning_operations[name] = []
	        for index, op in enumerate(cleaning_ops):
	            # The index keeps keys unique when two lines have identical text,
	            # which would otherwise raise a duplicate-widget error.
	            if st.checkbox(op, key=f"{name}_{index}_{op}"):
	                st.session_state.cleaning_operations[name].append(op)

	    if st.button("Apply Cleaning and Proceed to Chat"):
	        for name, ops in st.session_state.cleaning_operations.items():
	            for op in ops:
	                # Here you would implement the actual cleaning operations
	                # For now, we'll just print what would be done
	                st.write(f"Applying to {name}: {op}")

	        st.session_state.step = 3
	        st.success("Cleaning operations applied. Proceeding to chat interface.")
	        # Clicking this button causes a rerun, which then renders step 3.
	        st.button("Go to Chat Interface")

	    if st.button("Back to Data Upload"):
	        st.session_state.step = 1
	        # Fresh uploads must get fresh recommendations.
	        cache.clear()
	        # Prefer st.rerun; fall back to the older st.experimental_rerun name.
	        rerun = getattr(st, "rerun", None) or st.experimental_rerun
	        rerun()


	def _summarize_dataframe(name, df):
	    """Return the text summary of *df* (shape, columns, dtypes, head)."""
	    lines = [
	        f"Dataframe '{name}' summary:",
	        f"- Shape: {df.shape}",
	        # map(str, ...) keeps this working for non-string column labels.
	        f"- Columns: {', '.join(map(str, df.columns))}",
	        "- Data types:",
	    ]
	    lines.extend(f" - {col}: {dtype}" for col, dtype in df.dtypes.items())
	    lines.append("- Sample data (first 5 rows):")
	    lines.append(df.head().to_string())
	    return "\n".join(lines)


	def _recommend_cleaning(llm, df, summary, chunk_size=1500):
	    """Run ``analyze_chunk`` on each chunk of *summary* and join the answers."""
	    # NOTE: textwrap.wrap turns newlines into spaces with its default
	    # settings, so each chunk is a single flowed line of at most chunk_size
	    # characters.
	    chunks = textwrap.wrap(summary, chunk_size)
	    return "\n".join(analyze_chunk(llm, df, chunk) for chunk in chunks)

	def step_3_chat_with_data():
	    """Render step 3: a question box and the running chat transcript.

	    A submitted question is passed to ``process_user_input`` and the
	    question/answer pair is appended to ``st.session_state.chat_history``.
	    The text input keeps its value across reruns, so the last answered
	    question is remembered in ``st.session_state["last_question"]`` to avoid
	    answering and appending the same question again on every rerun.
	    Clearing the input resets that marker so the question can be re-asked.
	    """
	    st.subheader("Step 3: Chat with your data")

	    user_input = st.text_input("Ask a question about your data:")
	    if not user_input:
	        st.session_state["last_question"] = None
	    elif user_input != st.session_state.get("last_question"):
	        response = process_user_input(user_input)
	        # Record the question only after a successful answer so that a
	        # failed call is retried on the next run.
	        st.session_state["last_question"] = user_input
	        st.session_state.chat_history.append(("User", user_input))
	        st.session_state.chat_history.append(("AI", response))

	    for index, (role, message) in enumerate(st.session_state.chat_history):
	        # An explicit key per message prevents duplicate-widget errors when
	        # two entries have the same label and text.
	        widget_key = f"chat_message_{index}"
	        if role == "User":
	            # NOTE(review): newer Streamlit releases appear to reject
	            # text_area heights below 68 px, hence 68 instead of 50; confirm
	            # against the Streamlit version this app is pinned to.
	            st.text_area("You:", value=message, height=68, disabled=True, key=widget_key)
	        else:
	            st.text_area("AI:", value=message, height=100, disabled=True, key=widget_key)

	def process_user_input(user_input):
	    """Answer *user_input* with a pandas dataframe agent over all uploads.

	    All dataframes in ``st.session_state.dataframes`` are concatenated into
	    one frame with an extra ``source`` column holding the originating file
	    name. A short description of the available files is prepended to the
	    question before it is handed to the agent.

	    Args:
	        user_input: The question typed by the user.

	    Returns:
	        The agent's answer, or an explanatory message when no data has been
	        uploaded (``pd.concat`` raises ``ValueError`` on an empty list).
	    """
	    dataframes = st.session_state.dataframes
	    if not dataframes:
	        return "No data is available. Please upload at least one CSV file first."

	    llm = OpenAI(temperature=0)
	    combined_df = pd.concat(
	        [df.assign(source=name) for name, df in dataframes.items()],
	        ignore_index=True,
	    )

	    # map(str, ...) keeps the join working for non-string column labels.
	    df_summary = "Available data:\n" + "".join(
	        f"- {name}: {len(df)} rows, {len(df.columns)} columns\n"
	        f" Columns: {', '.join(map(str, df.columns))}\n\n"
	        for name, df in dataframes.items()
	    )

	    # NOTE(review): allow_dangerous_code=True opts in to the agent executing
	    # model-generated Python. The prompt contains free-form user text, so
	    # this app should only run in a sandboxed environment.
	    agent = create_pandas_dataframe_agent(
	        llm,
	        combined_df,
	        verbose=True,
	        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
	        allow_dangerous_code=True
	    )

	    full_input = f"{df_summary}\nThe data from all files has been combined into a single DataFrame with an additional 'source' column indicating the original file.\n\nUser question: {user_input}"

	    return agent.run(full_input)

	def analyze_chunk(llm, df, chunk, timeout=30):
	    """Ask a pandas dataframe agent for cleaning suggestions on one chunk.

	    Args:
	        llm: The language model handed to the agent.
	        df: The dataframe the agent operates on.
	        chunk: Part of the dataframe's text summary to include in the prompt.
	        timeout: Maximum number of seconds to wait for the agent's answer.

	    Returns:
	        The agent's answer, or a message starting with
	        "Analysis timed out or encountered an error:" when the agent fails
	        or does not answer within *timeout* seconds.
	    """
	    from concurrent.futures import TimeoutError as FutureTimeoutError

	    # NOTE(review): allow_dangerous_code=True opts in to the agent executing
	    # model-generated Python; run this app only in a sandboxed environment.
	    agent = create_pandas_dataframe_agent(
	        llm,
	        df,
	        verbose=True,
	        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
	        allow_dangerous_code=True
	    )

	    prompt = f"Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"

	    # The executor is deliberately not used as a context manager: leaving a
	    # `with` block calls shutdown(wait=True), which would block until the
	    # agent finishes and so defeat the timeout.
	    executor = ThreadPoolExecutor(max_workers=1)
	    try:
	        future = executor.submit(agent.run, prompt)
	        return future.result(timeout=timeout)
	    except FutureTimeoutError:
	        # A timeout carries no message of its own, so state what happened.
	        return f"Analysis timed out or encountered an error: no result within {timeout} seconds"
	    except Exception as e:
	        return f"Analysis timed out or encountered an error: {str(e)}"
	    finally:
	        # Do not wait for the worker. A running call cannot be interrupted
	        # and finishes in the background; its result is discarded.
	        executor.shutdown(wait=False)

	# Script entry point: draw the app only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()