LeannJoy committed on
Commit
1f78d55
·
verified ·
1 Parent(s): 582aac3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -130
app.py CHANGED
@@ -1,140 +1,91 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import time
4
-
5
- # --- Import Utilities ---
6
- # IMPORTANT: Both utils.py and app.py MUST be in the same directory for this import to work.
7
- try:
8
- from utils import initialize_hf_agent, steps_eda, function_agent, handle_start_button_click
9
- except ImportError:
10
- st.error("ERROR: Could not import 'utils.py'. Please ensure 'utils.py' is in the same folder as 'app.py'.")
11
- st.stop()
12
-
13
-
14
- # --- Configuration Constants ---
15
- # Available LLM Models for selection (Now only the selected HuggingFace model)
16
- HUGGINGFACE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
17
-
18
-
19
- # --- Streamlit UI Setup ---
20
- st.set_page_config(layout="wide", page_title="AI Data Science Assistant")
21
- st.title("AI Assistant for Data Science 🤖")
22
- st.write("Hello, 👋 I am your AI assistant and I am here to help you with your data science projects.")
23
-
24
- # --- Sidebar: Model Selection and Explanation ---
25
- with st.sidebar:
26
- st.write("Your Data Science Adventure Begins with an CSV File. ")
27
- st.caption("You may already know that every exciting data science journey starts with a CSV file. Upload your CSV file to get started!")
28
-
29
- st.divider()
30
-
31
- # Model Information Expander
32
- with st.expander("LLM Engine Details"):
33
- st.info("Using **Hugging Face Hub** as the LLM Engine.")
34
- st.write(f"**Model:** `{HUGGINGFACE_MODEL}`")
35
- st.caption("This requires the **HUGGINGFACEHUB_API_TOKEN** to be set in your Streamlit secrets or environment variables.")
36
 
37
- # Store the single model in session state for consistency with the original structure
38
- st.session_state.selected_model = HUGGINGFACE_MODEL
39
-
40
- st.divider()
41
- st.caption("Developed by [Your Name]")
42
-
43
-
44
- # --- Session State Initialization ---
45
-
46
- # Initialize the key in session state for the start button
47
- if "clicked" not in st.session_state:
48
- st.session_state.clicked = {1: False}
49
- if 'df' not in st.session_state:
50
- st.session_state.df = None
51
- if 'pandas_agent' not in st.session_state:
52
- st.session_state.pandas_agent = None
53
- if 'llm' not in st.session_state:
54
- st.session_state.llm = None
55
- if 'selected_model' not in st.session_state:
56
- st.session_state.selected_model = HUGGINGFACE_MODEL
57
-
58
-
59
- # The button calls the imported utility function
60
- st.button("Let's Get Started!", on_click=handle_start_button_click, args=[1])
61
-
62
-
63
- # --- Main Application Logic ---
64
- if st.session_state.clicked[1]:
65
- user_csv = st.file_uploader("Upload your CSV file", type="csv")
66
-
67
- if user_csv is not None:
68
-
69
- # 1. Load Data
70
- user_csv.seek(0)
71
- df = pd.read_csv(user_csv, low_memory=False)
72
- st.session_state.df = df
73
-
74
- # 2. Initialize Agent (uses imported utility function)
75
- # Note: The function name is changed to initialize_hf_agent
76
- st.session_state.llm, st.session_state.pandas_agent = initialize_hf_agent(df, st.session_state.selected_model)
77
-
78
- # Check if agent is successfully initialized
79
- if st.session_state.pandas_agent is None:
80
- # The error message is already shown in utils.py on failure
81
- st.stop()
82
-
83
- # --- Main Execution Flow ---
84
-
85
- st.header("Exploratory Data Analysis (EDA)")
86
-
87
- # Display EDA Steps in Expander (uses imported utility function)
88
- with st.sidebar:
89
- with st.expander("What are the steps of EDA?"):
90
- # Note: The llm object is passed to steps_eda
91
- steps_text = steps_eda(st.session_state.llm)
92
- st.markdown(steps_text)
93
-
94
- # Run the initial, automated EDA sequence (uses imported utility function)
95
- function_agent(st.session_state.pandas_agent, st.session_state.df)
96
-
97
-
98
- st.divider()
99
- st.subheader("Variable of Study")
100
- user_question_variable = st.text_input("What variable would you like to analyze (e.g., 'price') and what question do you have about it? (e.g., 'What is the distribution of age?')")
101
 
102
- if user_question_variable:
103
- st.info(f"Analyzing variable: **{user_question_variable}**")
104
-
105
- # Efficient combined prompt for analysis
106
- variable_analysis_prompt = (
107
- f"Analyze the variable {user_question_variable}. "
108
- f"Specifically, provide summary statistics (mean, median, mode, quartiles), "
109
- f"identify any outliers, and check for missing values. "
110
- f"Also, use your plotting tool to create a histogram or a box plot for this variable. "
111
- f"Output the plot code separately."
112
- )
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- with st.spinner("Running deep variable analysis..."):
115
- start_time = time.time()
116
- response = st.session_state.pandas_agent.run(variable_analysis_prompt)
117
- end_time = time.time()
118
-
119
- st.write(response)
120
 
121
- st.info(f"Analysis Time: **{end_time - start_time:.2f} seconds**")
122
- st.warning("The agent will output Python code for plots. You would need to manually execute this code to visualize it.")
123
-
124
 
125
- st.divider()
126
- st.subheader("Further Study")
 
 
 
 
 
127
 
128
- user_question_dataframe = st.text_input("Do you have any other final questions about the dataset or need a complex visualization? (e.g., 'What is the correlation between age and salary?')")
 
 
 
 
 
129
 
130
- if user_question_dataframe:
131
- st.info(f"Final question: **{user_question_dataframe}**")
 
 
 
 
 
 
 
 
 
132
 
133
- with st.spinner("Running final analysis..."):
134
- final_response = st.session_state.pandas_agent.run(user_question_dataframe)
135
-
136
- st.write(final_response)
137
-
138
- st.divider()
139
- st.header("Data Science Problem")
140
- st.write("Now that we have a solid grasp of the data at hand and a clear understanding of the variables we intend to investigate, it's time to define the specific data science problem we aim to solve. This step is crucial as it sets the direction for our analysis and helps us determine the appropriate methodologies and techniques to employ.")
 
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from utils import perform_eda, query_agent
4
+ import io
5
+
6
+ # --- Page Configuration ---
7
# Use the full browser width and start with the sidebar open so the usage
# instructions are visible on first load.
st.set_page_config(layout="wide", initial_sidebar_state="expanded", page_title="NL Data Analysis Agent")
12
+
13
def _decode_csv_bytes(raw):
    """Decode the raw bytes of an uploaded CSV file into text.

    UTF-8 is tried first. The ``utf-8-sig`` codec is used so that a leading
    byte-order mark (written by some spreadsheet exports) is stripped instead
    of ending up glued to the first column name. If the bytes are not valid
    UTF-8, Latin-1 is used as a fallback because it can decode any byte
    sequence.

    Args:
        raw: The complete file content as ``bytes``.

    Returns:
        A ``(text, used_fallback)`` tuple. ``used_fallback`` is ``True`` when
        the Latin-1 fallback was needed.
    """
    try:
        return raw.decode("utf-8-sig"), False
    except UnicodeDecodeError:
        return raw.decode("latin-1"), True


def _render_sidebar():
    """Render the sidebar with usage instructions and the LLM note."""
    instructions = "\n".join((
        "",
        "1. **Upload** your CSV file in the main area.",
        "2. **Review** the automatic Exploratory Data Analysis (EDA) summary.",
        "3. **Ask** your question in the text box. The system uses a LangChain Agent to run Python/Pandas code on your data and provide an answer.",
        "",
        "**Examples of Queries:**",
        '- "What is the average income?"',
        "- \"Show a histogram of the 'Age' column.\"",
        '- "Calculate the correlation matrix and identify the strongest relationship."',
        "- \"Perform a bivariate analysis of 'Gender' and 'Purchase_Amount'.\"",
        "",
    ))
    with st.sidebar:
        st.header("Instructions")
        st.markdown(instructions)
        st.warning("⚠️ **Note on LLM:** This app uses the Gemini API via LangChain. For production use with a truly open-source LLM, you would need to replace the LLM initialization in `utils.py` with a compatible self-hosted or endpoint-based model.")


def _render_query_section(df):
    """Render the question box and, on request, show the agent's answer.

    Args:
        df: The DataFrame loaded from the uploaded CSV; passed to
            ``query_agent`` together with the user's question.
    """
    st.subheader("Ask a Question about the Data")

    user_query = st.text_area(
        "Enter your data question (e.g., 'Plot the distribution of Age' or 'What is the median salary?'):",
        placeholder="Ask me anything about your data...",
        key="user_query_input",
    )

    if not st.button("Run Analysis", use_container_width=True, type="primary"):
        return

    # Tell the user why nothing happens instead of silently ignoring the click.
    if not (user_query or "").strip():
        st.warning("Please enter a question before running the analysis.")
        return

    with st.spinner("Analyzing data with LangChain Agent..."):
        agent_response = query_agent(df, user_query)

    st.success("Analysis Complete!")
    st.markdown("---")
    st.subheader("Agent Response")
    st.info(agent_response)


def main():
    """Run the Streamlit application.

    Flow: show the title and sidebar, wait for a CSV upload, load it into a
    DataFrame, show a preview and the automated EDA summary, then let the
    user ask questions that are answered by ``query_agent``.

    File-loading failures and analysis failures are reported separately so
    that an agent/LLM error is not misreported as a malformed CSV.
    """
    st.title("📊 Natural Language Data Analysis Agent")
    st.caption("Upload a CSV file and ask questions about your data using plain English.")

    _render_sidebar()

    uploaded_file = st.file_uploader(
        "Upload a CSV file",
        type="csv",
        help="The CSV file should be structured with headers.",
    )

    if uploaded_file is None:
        st.info("Awaiting CSV file upload.")
        return

    # --- Load the CSV ---
    # Only the read/parse step is guarded here, so the "check your CSV" hint
    # is shown exclusively for problems with the file itself.
    try:
        # getvalue() returns the whole buffer regardless of the read position.
        text, used_fallback = _decode_csv_bytes(uploaded_file.getvalue())
        df = pd.read_csv(io.StringIO(text))
    except Exception as e:  # UI boundary: report the problem, do not crash the app
        st.error(f"An error occurred while processing the file: {e}")
        st.warning("Please ensure your CSV file is correctly formatted.")
        return

    st.success("CSV file successfully loaded!")
    if used_fallback:
        st.warning("The file is not valid UTF-8 and was read as Latin-1; some special characters may be displayed incorrectly.")

    # --- Data Preview ---
    st.subheader("Data Preview")
    st.dataframe(df.head())

    # --- Automated EDA and natural-language queries ---
    try:
        with st.expander("🔬 Automated Exploratory Data Analysis (EDA)", expanded=False):
            st.markdown(perform_eda(df))

        _render_query_section(df)
    except Exception as e:  # UI boundary: surface analysis/agent failures as such
        st.error(f"An error occurred while analyzing the data: {e}")


if __name__ == "__main__":
    main()