chirpy123 committed on
Commit
d530b89
·
1 Parent(s): 0475eb5

Add application file

Browse files
Files changed (1) hide show
  1. app.py +166 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ from langchain_community.llms import OpenAI
5
+ from langchain.agents.agent_types import AgentType
6
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
7
+ import textwrap
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from functools import partial
10
+ import time
11
+
12
+ # Initialize session state
13
+ if 'step' not in st.session_state:
14
+ st.session_state.step = 1
15
+ if 'dataframes' not in st.session_state:
16
+ st.session_state.dataframes = {}
17
+ if 'chat_history' not in st.session_state:
18
+ st.session_state.chat_history = []
19
+ if 'cleaning_operations' not in st.session_state:
20
+ st.session_state.cleaning_operations = {}
21
+
22
+ def main():
23
+ st.title("Data Analysis Chat App")
24
+
25
+ if st.session_state.step == 1:
26
+ step_1_upload_and_analyze()
27
+ elif st.session_state.step == 2:
28
+ step_2_clean_data()
29
+ elif st.session_state.step == 3:
30
+ step_3_chat_with_data()
31
+
32
+ def step_1_upload_and_analyze():
33
+ st.subheader("Step 1: Upload and Analyze Data")
34
+
35
+ uploaded_files = st.file_uploader("Upload CSV files", type="csv", accept_multiple_files=True)
36
+ if uploaded_files:
37
+ for file in uploaded_files:
38
+ df = pd.read_csv(file)
39
+ st.session_state.dataframes[file.name] = df
40
+ st.success(f"Uploaded: {file.name}")
41
+
42
+ if st.button("Analyze Data"):
43
+ for name, df in st.session_state.dataframes.items():
44
+ st.write(f"Analysis for {name}:")
45
+ st.write(f"Shape: {df.shape}")
46
+ st.write("Columns:")
47
+ st.write(df.columns.tolist())
48
+ st.write("Preview:")
49
+ st.write(df.head())
50
+ st.write("---")
51
+
52
+ if st.button("Proceed to Data Cleaning"):
53
+ st.session_state.step = 2
54
+
55
+ def step_2_clean_data():
56
+ st.subheader("Step 2: Clean Data")
57
+
58
+ llm = OpenAI(temperature=0)
59
+
60
+ for name, df in st.session_state.dataframes.items():
61
+ st.write(f"Cleaning recommendations for {name}:")
62
+
63
+ # Create a summary of the dataframe
64
+ summary = f"Dataframe '{name}' summary:\n"
65
+ summary += f"- Shape: {df.shape}\n"
66
+ summary += f"- Columns: {', '.join(df.columns)}\n"
67
+ summary += "- Data types:\n"
68
+ for col, dtype in df.dtypes.items():
69
+ summary += f" - {col}: {dtype}\n"
70
+ summary += "- Sample data (first 5 rows):\n"
71
+ summary += df.head().to_string()
72
+
73
+ # Split the summary into smaller chunks
74
+ chunk_size = 1500 # Reduced chunk size
75
+ chunks = textwrap.wrap(summary, chunk_size)
76
+
77
+ cleaning_recommendations = []
78
+ with st.spinner("Analyzing data and generating recommendations..."):
79
+ for i, chunk in enumerate(chunks):
80
+ chunk_result = analyze_chunk(llm, df, chunk)
81
+ cleaning_recommendations.append(chunk_result)
82
+
83
+ # Combine all recommendations
84
+ full_recommendations = "\n".join(cleaning_recommendations)
85
+ st.write(full_recommendations)
86
+
87
+ # Create checkboxes for cleaning operations
88
+ cleaning_ops = [op.strip() for op in full_recommendations.split('\n') if op.strip()]
89
+ st.session_state.cleaning_operations[name] = []
90
+ for op in cleaning_ops:
91
+ if st.checkbox(op, key=f"{name}_{op}"):
92
+ st.session_state.cleaning_operations[name].append(op)
93
+
94
+ if st.button("Apply Cleaning and Proceed to Chat"):
95
+ for name, ops in st.session_state.cleaning_operations.items():
96
+ df = st.session_state.dataframes[name]
97
+ for op in ops:
98
+ # Here you would implement the actual cleaning operations
99
+ # For now, we'll just print what would be done
100
+ st.write(f"Applying to {name}: {op}")
101
+
102
+ st.session_state.step = 3
103
+ st.success("Cleaning operations applied. Proceeding to chat interface.")
104
+ st.button("Go to Chat Interface")
105
+
106
+ if st.button("Back to Data Upload"):
107
+ st.session_state.step = 1
108
+ st.experimental_rerun()
109
+
110
+ def step_3_chat_with_data():
111
+ st.subheader("Step 3: Chat with your data")
112
+
113
+ user_input = st.text_input("Ask a question about your data:")
114
+ if user_input:
115
+ response = process_user_input(user_input)
116
+ st.session_state.chat_history.append(("User", user_input))
117
+ st.session_state.chat_history.append(("AI", response))
118
+
119
+ for role, message in st.session_state.chat_history:
120
+ if role == "User":
121
+ st.text_area("You:", value=message, height=50, disabled=True)
122
+ else:
123
+ st.text_area("AI:", value=message, height=100, disabled=True)
124
+
125
+ def process_user_input(user_input):
126
+ llm = OpenAI(temperature=0)
127
+ combined_df = pd.concat([df.assign(source=name) for name, df in st.session_state.dataframes.items()], ignore_index=True)
128
+
129
+ df_summary = "Available data:\n"
130
+ for name, df in st.session_state.dataframes.items():
131
+ df_summary += f"- {name}: {len(df)} rows, {len(df.columns)} columns\n"
132
+ df_summary += f" Columns: {', '.join(df.columns)}\n\n"
133
+
134
+ agent = create_pandas_dataframe_agent(
135
+ llm,
136
+ combined_df,
137
+ verbose=True,
138
+ agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
139
+ allow_dangerous_code=True
140
+ )
141
+
142
+ full_input = f"{df_summary}\nThe data from all files has been combined into a single DataFrame with an additional 'source' column indicating the original file.\n\nUser question: {user_input}"
143
+
144
+ response = agent.run(full_input)
145
+ return response
146
+
147
+ def analyze_chunk(llm, df, chunk, timeout=30):
148
+ agent = create_pandas_dataframe_agent(
149
+ llm,
150
+ df,
151
+ verbose=True,
152
+ agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
153
+ allow_dangerous_code=True
154
+ )
155
+
156
+ prompt = f"Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"
157
+
158
+ try:
159
+ with ThreadPoolExecutor() as executor:
160
+ future = executor.submit(agent.run, prompt)
161
+ return future.result(timeout=timeout)
162
+ except Exception as e:
163
+ return f"Analysis timed out or encountered an error: {str(e)}"
164
+
165
+ if __name__ == "__main__":
166
+ main()