Ashwin committed on
Commit
3351f47
·
1 Parent(s): d56c4ea

Copied from other repo

Browse files
Files changed (6) hide show
  1. .env +2 -0
  2. app.py +166 -0
  3. explore.py +83 -0
  4. persistence.py +80 -0
  5. requirements.txt +19 -0
  6. tryvanna.py +23 -0
.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ DATABASE_URL=postgres://<user>:<password>@<host>/<db>  # NOTE(review): real credentials were committed here — rotate them and keep .env out of version control
2
+ VANNA_API_KEY=<your-vanna-api-key>  # NOTE(review): a real API key was committed here — rotate it
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ from langchain_community.llms import OpenAI
5
+ from langchain.agents.agent_types import AgentType
6
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
7
+ import textwrap
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from functools import partial
10
+ import time
11
+
12
# Seed st.session_state with first-run defaults (idempotent across reruns).
_SESSION_DEFAULTS = {
    'step': 1,
    'dataframes': {},
    'chat_history': [],
    'cleaning_operations': {},
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
21
+
22
def main():
    """Render the app title and dispatch to the handler for the current step."""
    st.title("Data Analysis Chat App")

    step_handlers = {
        1: step_1_upload_and_analyze,
        2: step_2_clean_data,
        3: step_3_chat_with_data,
    }
    handler = step_handlers.get(st.session_state.step)
    if handler is not None:
        handler()
31
+
32
def step_1_upload_and_analyze():
    """Step 1: let the user upload CSV files and show a quick per-file analysis.

    Uploaded frames are stored in st.session_state.dataframes keyed by filename.
    """
    st.subheader("Step 1: Upload and Analyze Data")

    uploaded_files = st.file_uploader("Upload CSV files", type="csv", accept_multiple_files=True)
    if uploaded_files:
        for file in uploaded_files:
            df = pd.read_csv(file)
            st.session_state.dataframes[file.name] = df
            st.success(f"Uploaded: {file.name}")

    if st.button("Analyze Data"):
        for name, df in st.session_state.dataframes.items():
            st.write(f"Analysis for {name}:")
            st.write(f"Shape: {df.shape}")
            st.write("Columns:")
            st.write(df.columns.tolist())
            st.write("Preview:")
            st.write(df.head())
            st.write("---")

    if st.button("Proceed to Data Cleaning"):
        st.session_state.step = 2
        # Without an explicit rerun, the step-1 UI stays on screen until the
        # user's NEXT interaction (main() already dispatched before the button
        # handler ran). Matches the rerun pattern used elsewhere in the app.
        st.experimental_rerun()
54
+
55
def step_2_clean_data():
    """Step 2: ask an LLM agent for cleaning recommendations per dataframe,
    let the user tick the ones to apply, then advance to the chat step.

    Selected operations are stored in st.session_state.cleaning_operations,
    keyed by dataframe name. The operations are currently only echoed, not
    actually applied.
    """
    st.subheader("Step 2: Clean Data")

    llm = OpenAI(temperature=0)

    for name, df in st.session_state.dataframes.items():
        st.write(f"Cleaning recommendations for {name}:")

        # Create a summary of the dataframe
        summary = f"Dataframe '{name}' summary:\n"
        summary += f"- Shape: {df.shape}\n"
        summary += f"- Columns: {', '.join(df.columns)}\n"
        summary += "- Data types:\n"
        for col, dtype in df.dtypes.items():
            summary += f" - {col}: {dtype}\n"
        summary += "- Sample data (first 5 rows):\n"
        summary += df.head().to_string()

        # Split the summary into smaller chunks so each agent call stays
        # within a manageable prompt size.
        chunk_size = 1500  # Reduced chunk size
        chunks = textwrap.wrap(summary, chunk_size)

        cleaning_recommendations = []
        with st.spinner("Analyzing data and generating recommendations..."):
            for i, chunk in enumerate(chunks):
                chunk_result = analyze_chunk(llm, df, chunk)
                cleaning_recommendations.append(chunk_result)

        # Combine all recommendations
        full_recommendations = "\n".join(cleaning_recommendations)
        st.write(full_recommendations)

        # Create checkboxes for cleaning operations
        cleaning_ops = [op.strip() for op in full_recommendations.split('\n') if op.strip()]
        st.session_state.cleaning_operations[name] = []
        for op in cleaning_ops:
            if st.checkbox(op, key=f"{name}_{op}"):
                st.session_state.cleaning_operations[name].append(op)

    if st.button("Apply Cleaning and Proceed to Chat"):
        for name, ops in st.session_state.cleaning_operations.items():
            df = st.session_state.dataframes[name]
            for op in ops:
                # Here you would implement the actual cleaning operations
                # For now, we'll just print what would be done
                st.write(f"Applying to {name}: {op}")

        st.session_state.step = 3
        st.success("Cleaning operations applied. Proceeding to chat interface.")
        # Rerun so step 3 renders immediately. The old code instead rendered a
        # dead "Go to Chat Interface" button whose click handler did nothing —
        # the transition only happened because ANY interaction reruns the app.
        st.experimental_rerun()

    if st.button("Back to Data Upload"):
        st.session_state.step = 1
        st.experimental_rerun()
109
+
110
def step_3_chat_with_data():
    """Step 3: free-form Q&A over the uploaded data, with a visible transcript."""
    st.subheader("Step 3: Chat with your data")

    user_input = st.text_input("Ask a question about your data:")
    if user_input:
        answer = process_user_input(user_input)
        st.session_state.chat_history.extend([("User", user_input), ("AI", answer)])

    # Replay the transcript; user turns get a shorter box than AI turns.
    for role, message in st.session_state.chat_history:
        is_user = role == "User"
        st.text_area(
            "You:" if is_user else "AI:",
            value=message,
            height=50 if is_user else 100,
            disabled=True,
        )
124
+
125
def process_user_input(user_input):
    """Answer a user question by running a pandas agent over all uploaded data.

    All uploaded frames are stacked into one DataFrame (with a 'source' column
    naming the originating file) and handed to a LangChain pandas agent.
    """
    llm = OpenAI(temperature=0)

    # Tag each frame with its originating file, then stack them vertically.
    tagged_frames = [
        frame.assign(source=fname)
        for fname, frame in st.session_state.dataframes.items()
    ]
    combined_df = pd.concat(tagged_frames, ignore_index=True)

    # Describe each uploaded file so the agent knows what it is looking at.
    summary_parts = ["Available data:\n"]
    for fname, frame in st.session_state.dataframes.items():
        summary_parts.append(f"- {fname}: {len(frame)} rows, {len(frame.columns)} columns\n")
        summary_parts.append(f" Columns: {', '.join(frame.columns)}\n\n")
    df_summary = "".join(summary_parts)

    agent = create_pandas_dataframe_agent(
        llm,
        combined_df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True
    )

    full_input = f"{df_summary}\nThe data from all files has been combined into a single DataFrame with an additional 'source' column indicating the original file.\n\nUser question: {user_input}"

    return agent.run(full_input)
146
+
147
def analyze_chunk(llm, df, chunk, timeout=30):
    """Ask a pandas agent for up to 3 cleaning suggestions on one summary chunk.

    Returns the agent's text, or an error string if the call raises or does
    not finish within *timeout* seconds.
    """
    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True
    )

    prompt = f"Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"

    # Run the agent in a worker thread so the timeout can be enforced. Do NOT
    # use `with ThreadPoolExecutor()`: its __exit__ calls shutdown(wait=True),
    # which blocks until the still-running task finishes and defeats the
    # timeout entirely.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(agent.run, prompt)
        return future.result(timeout=timeout)
    except Exception as e:
        return f"Analysis timed out or encountered an error: {str(e)}"
    finally:
        # Return immediately; cancel the task if it never started.
        executor.shutdown(wait=False, cancel_futures=True)
164
+
165
# Script entry point: launch the Streamlit wizard.
if __name__ == "__main__":
    main()
explore.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from vanna import VannaBase
4
+ import pandas as pd
5
+ from dotenv import load_dotenv
6
+ from sqlalchemy import create_engine
7
+ from sqlalchemy.exc import SQLAlchemyError
8
+
9
# Load environment variables
load_dotenv()

# Initialize Vanna AI. VannaBase is an abstract base class and cannot be
# instantiated directly (its constructor also takes a config dict, not an
# api_key kwarg) — use the hosted VannaDefault implementation, matching
# tryvanna.py in this repo.
from vanna.remote import VannaDefault

vanna_api_key = os.getenv("VANNA_API_KEY")
if not vanna_api_key:
    st.error("VANNA_API_KEY is not set in the environment variables. Please set it and restart the application.")
    st.stop()

vn = VannaDefault(model='gpt-3.5-turbo', api_key=vanna_api_key)
19
+
20
from urllib.parse import urlparse

# Check if DATABASE_URL is set
database_url = os.getenv("DATABASE_URL")
if not database_url:
    st.error("DATABASE_URL is not set in the environment variables. Please set it and restart the application.")
    st.stop()

# Try to connect to the database
try:
    engine = create_engine(database_url)
    with engine.connect() as connection:
        st.success("Successfully connected to the database.")
    # vanna's connect_to_postgres() takes discrete host/dbname/user/password/
    # port arguments, not a connection URL (see tryvanna.py) — passing the raw
    # URL as the first positional arg would fail. Parse it first.
    parsed = urlparse(database_url)
    vn.connect_to_postgres(
        host=parsed.hostname,
        dbname=parsed.path.lstrip('/'),
        user=parsed.username,
        password=parsed.password,
        port=parsed.port or 5432,
    )
except SQLAlchemyError as e:
    st.error(f"Failed to connect to the database: {str(e)}")
    st.stop()
35
+
36
st.title("Data Explorer")

# Initialize chat history on first run
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Replay prior chat messages on every rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
46
+
47
# Chat input
user_question = st.chat_input("Ask about your data")
if user_question:
    # Record and echo the user's message
    st.session_state.messages.append({"role": "user", "content": user_question})
    with st.chat_message("user"):
        st.markdown(user_question)

    try:
        # Generate SQL query from the natural-language question
        sql_query = vn.generate_sql(user_question)

        # Execute SQL query and get results
        df = vn.run_sql(sql_query)

        query_blurb = f"Here's the SQL query I generated:\n```sql\n{sql_query}\n```"

        # Show the query and its results
        with st.chat_message("assistant"):
            st.markdown(query_blurb)
            st.markdown("And here are the results:")
            st.dataframe(df)

        # Record the assistant's reply in the transcript
        st.session_state.messages.append({
            "role": "assistant",
            "content": f"{query_blurb}\n\nAnd here are the results:\n{df.to_markdown()}"
        })
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
76
+
77
# Sidebar with additional information
_about_text = (
    "This is a data exploration tool using Streamlit and Vanna AI. "
    "Ask questions about your data in natural language, and the app will "
    "generate SQL queries and display the results."
)
st.sidebar.header("About")
st.sidebar.info(_about_text)
persistence.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import time
4
+ from sqlalchemy import create_engine, Column, String, Integer, Float, DateTime, inspect, MetaData
5
+ from sqlalchemy.orm import declarative_base
6
+ from sqlalchemy.exc import SQLAlchemyError
7
+
8
+
9
# Database connection string; fail fast with a clear message if it is missing —
# create_engine(None) would otherwise raise a confusing ArgumentError.
DATABASE_URL = os.environ.get('DATABASE_URL')
if not DATABASE_URL:
    raise RuntimeError("DATABASE_URL environment variable is not set")
engine = create_engine(DATABASE_URL)
11
+
12
+
13
def get_df_from_csv(csv_file_path):
    """Load the CSV file at *csv_file_path* into a pandas DataFrame."""
    return pd.read_csv(csv_file_path)
16
+
17
def get_schema_from_df(df):
    """Return a JSON Table Schema dict describing *df*'s columns and types."""
    return pd.io.json.build_table_schema(df)
20
+
21
def create_table_from_schema(table_name, schema):
    """Create a database table matching a JSON Table Schema.

    If a table with the same name and the same column names already exists,
    creation is skipped. If it exists with different columns, a new table
    with a timestamp suffix is created instead. Errors are printed, not raised.
    """
    Base = declarative_base()

    inspector = inspect(engine)
    metadata = MetaData()
    metadata.reflect(bind=engine)

    # Check if table already exists
    if table_name in inspector.get_table_names():
        # Compare by column NAMES. The previous code compared SQLAlchemy type
        # objects against JSON-schema type strings ('integer', 'string', ...),
        # which can never be equal — so the "skip" branch was unreachable and
        # every re-run created a timestamped duplicate table.
        existing_columns = {column['name'] for column in inspector.get_columns(table_name)}
        new_columns = {field['name'] for field in schema['fields']} | {'id'}

        if existing_columns == new_columns:
            print(f"Table '{table_name}' with the same schema already exists. Skipping creation.")
            return
        else:
            print(f"Table '{table_name}' exists but has a different schema. Creating a new table with a timestamp suffix.")
            table_name = f"{table_name}_{int(time.time())}"

    # Map JSON Table Schema types onto SQLAlchemy column types; anything
    # unrecognized falls back to String.
    type_map = {'integer': Integer, 'number': Float, 'datetime': DateTime}
    namespace = {
        '__tablename__': table_name,
        'id': Column(Integer, primary_key=True),
    }
    for field in schema['fields']:
        if field['name'] != 'id':
            namespace[field['name']] = Column(type_map.get(field['type'], String))

    # Build the mapped class with type() instead of mutating locals() inside a
    # class body, which is fragile and CPython-specific.
    type('DynamicTable', (Base,), namespace)

    try:
        Base.metadata.create_all(engine)
        print(f"Table '{table_name}' created successfully.")
    except SQLAlchemyError as e:
        print(f"Error creating table: {str(e)}")
61
+
62
+
63
def save_data_to_table(table_name, df):
    """Append *df*'s rows to *table_name*; print (don't raise) on DB errors.

    The table is created beforehand by create_table_from_schema, so
    if_exists='append' is required — pandas' default 'fail' would always
    raise against the freshly created table and no data would ever be saved.
    index=False keeps the DataFrame index from being written as a column.
    """
    try:
        df.to_sql(table_name, engine, if_exists='append', index=False)
    except SQLAlchemyError as e:
        print(f"Error saving data to table: {str(e)}")
68
+
69
+
70
+
71
# Script entry point: load data.csv, derive a table from its schema,
# and persist the rows.
if __name__ == "__main__":
    source_csv = 'data.csv'
    frame = get_df_from_csv(source_csv)
    table_schema = get_schema_from_df(frame)
    target_table = source_csv.partition('.')[0]

    create_table_from_schema(target_table, table_schema)
    save_data_to_table(target_table, frame)
79
+
80
+
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ plotly==5.24.1
2
+ langchain==0.3.4
3
+ langchain-community==0.3.3
4
+ langchain-core==0.3.12
5
+ langchain-experimental==0.3.2
6
+ langchain-openai==0.2.3
7
+ langchain-text-splitters==0.3.0
8
+ tabulate==0.9.0
9
+ vanna==0.7.3
10
+ psycopg2-binary
11
+ psycopg2
12
+ streamlit==1.31.0
13
+ pandas==2.2.0
14
+ python-dotenv==1.0.0
15
+ sqlalchemy==2.0.25
16
+
17
+
18
+
19
+
tryvanna.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from vanna.flask import VannaFlaskApp
from vanna.remote import VannaDefault

# Hosted Vanna model; the API key comes from the environment.
vn = VannaDefault(model='gpt-3.5-turbo', api_key=os.getenv("VANNA_API_KEY"))

# Database connection settings. Set DATABASE_HOST/NAME/USER/PASSWORD in the
# environment before running. (The commented-out shell transcript that used
# to live here exposed a real password — those credentials must be rotated.)
db_host = os.getenv("DATABASE_HOST")
db_name = os.getenv("DATABASE_NAME")
db_user = os.getenv("DATABASE_USER")
db_password = os.getenv("DATABASE_PASSWORD")
db_port = 5432

vn.connect_to_postgres(host=db_host, dbname=db_name, user=db_user, password=db_password, port=db_port)

# Smoke-test query, then launch Vanna's built-in Flask UI.
vn.ask('What are the top 10 artists by sales?')

VannaFlaskApp(vn).run()