LeannJoy committed on
Commit
ae15120
·
verified ·
1 Parent(s): 8f3b663

Upload 5 files

Browse files
Files changed (5) hide show
  1. .env +0 -0
  2. README.md +0 -10
  3. app.py +140 -0
  4. requirements.txt +5 -0
  5. utils.py +101 -0
.env ADDED
File without changes
README.md CHANGED
@@ -1,10 +0,0 @@
1
- ---
2
- title: DataAnalysisApp
3
- emoji: 👁
4
- colorFrom: pink
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import time
4
+
5
+ # --- Import Utilities ---
6
+ # IMPORTANT: Both utils.py and app.py MUST be in the same directory for this import to work.
7
+ try:
8
+ from utils import initialize_hf_agent, steps_eda, function_agent, handle_start_button_click
9
+ except ImportError:
10
+ st.error("ERROR: Could not import 'utils.py'. Please ensure 'utils.py' is in the same folder as 'app.py'.")
11
+ st.stop()
12
+
13
+
14
# --- Configuration ---
# A single Hugging Face model is used, so no model picker is rendered.
HUGGINGFACE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"


# --- Page header ---
st.set_page_config(layout="wide", page_title="AI Data Science Assistant")
st.title("AI Assistant for Data Science 🤖")
st.write("Hello, 👋 I am your AI assistant and I am here to help you with your data science projects.")

# --- Sidebar: introduction and LLM engine details ---
# Object notation (`st.sidebar.<element>`) is used instead of `with st.sidebar:`;
# both place the element in the sidebar.
sidebar = st.sidebar
sidebar.write("Your Data Science Adventure Begins with an CSV File. ")
sidebar.caption("You may already know that every exciting data science journey starts with a CSV file. Upload your CSV file to get started!")

sidebar.divider()

# Collapsible panel describing which LLM backend is in use.
engine_details = sidebar.expander("LLM Engine Details")
engine_details.info("Using **Hugging Face Hub** as the LLM Engine.")
engine_details.write(f"**Model:** `{HUGGINGFACE_MODEL}`")
engine_details.caption("This requires the **HUGGINGFACEHUB_API_TOKEN** to be set in your Streamlit secrets or environment variables.")

# Record the (only) model in session state on every run.
st.session_state.selected_model = HUGGINGFACE_MODEL

sidebar.divider()
sidebar.caption("Developed by [Your Name]")
42
+
43
+
44
# --- Session State Defaults ---
# Each key is assigned only when it is not already present in the session
# state, so values written by an earlier run are left alone.
_SESSION_DEFAULTS = {
    "clicked": {1: False},  # start-button flags, keyed by button id
    "df": None,
    "pandas_agent": None,
    "llm": None,
    "selected_model": HUGGINGFACE_MODEL,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default


# The start button invokes the imported callback with button id 1.
st.button("Let's Get Started!", on_click=handle_start_button_click, args=[1])
61
+
62
+
63
# --- Main Application Logic ---
# NOTE(review): the nesting below was reconstructed from a flattened diff view
# in which indentation was lost. In particular, confirm against the original
# file whether "Further Study" sits at the upload level and whether the
# "Data Science Problem" section belongs inside `if user_question_dataframe:`.


def _ask_agent(agent, prompt):
    """Run ``prompt`` through ``agent`` and return its answer, or None on failure.

    Any exception raised by the agent is reported with ``st.error`` instead of
    aborting the whole page. This is a deliberate catch-all at the UI boundary.
    """
    try:
        return agent.run(prompt)
    except Exception as exc:
        st.error(f"The agent could not complete this request. Error: {exc}")
        return None


if st.session_state.clicked[1]:
    user_csv = st.file_uploader("Upload your CSV file", type="csv")

    if user_csv is not None:

        # 1. Load Data
        # Rewind first: the uploaded buffer may already have been read on a
        # previous run of the script.
        user_csv.seek(0)
        try:
            df = pd.read_csv(user_csv, low_memory=False)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            # An empty or malformed upload is a user error, not a crash.
            st.error(f"Could not read the uploaded CSV file. Error: {exc}")
            st.stop()
        st.session_state.df = df

        # 2. Initialize Agent (imported utility function)
        st.session_state.llm, st.session_state.pandas_agent = initialize_hf_agent(
            df, st.session_state.selected_model
        )

        # initialize_hf_agent reports its own error; just halt this run.
        if st.session_state.pandas_agent is None:
            st.stop()

        # --- Main Execution Flow ---

        st.header("Exploratory Data Analysis (EDA)")

        # EDA steps, as described by the LLM, shown in a sidebar expander.
        with st.sidebar:
            with st.expander("What are the steps of EDA?"):
                steps_text = steps_eda(st.session_state.llm)
                st.markdown(steps_text)

        # Initial, automated EDA sequence (imported utility function).
        function_agent(st.session_state.pandas_agent, st.session_state.df)

        st.divider()
        st.subheader("Variable of Study")
        user_question_variable = st.text_input("What variable would you like to analyze (e.g., 'price') and what question do you have about it? (e.g., 'What is the distribution of age?')")

        if user_question_variable:
            st.info(f"Analyzing variable: **{user_question_variable}**")

            # One combined prompt, so the agent is invoked once per question.
            variable_analysis_prompt = (
                f"Analyze the variable {user_question_variable}. "
                f"Specifically, provide summary statistics (mean, median, mode, quartiles), "
                f"identify any outliers, and check for missing values. "
                f"Also, use your plotting tool to create a histogram or a box plot for this variable. "
                f"Output the plot code separately."
            )

            with st.spinner("Running deep variable analysis..."):
                start_time = time.time()
                response = _ask_agent(st.session_state.pandas_agent, variable_analysis_prompt)
                end_time = time.time()

            # On failure _ask_agent has already shown the error.
            if response is not None:
                st.write(response)

                st.info(f"Analysis Time: **{end_time - start_time:.2f} seconds**")
                st.warning("The agent will output Python code for plots. You would need to manually execute this code to visualize it.")

        st.divider()
        st.subheader("Further Study")

        user_question_dataframe = st.text_input("Do you have any other final questions about the dataset or need a complex visualization? (e.g., 'What is the correlation between age and salary?')")

        if user_question_dataframe:
            st.info(f"Final question: **{user_question_dataframe}**")

            with st.spinner("Running final analysis..."):
                final_response = _ask_agent(st.session_state.pandas_agent, user_question_dataframe)

            if final_response is not None:
                st.write(final_response)

            st.divider()
            st.header("Data Science Problem")
            st.write("Now that we have a solid grasp of the data at hand and a clear understanding of the variables we intend to investigate, it's time to define the specific data science problem we aim to solve. This step is crucial as it sets the direction for our analysis and helps us determine the appropriate methodologies and techniques to employ.")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ langchain-community
4
+ langchain-experimental
5
+ langchain-core
utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from langchain_community.llms import HuggingFaceHub # New Import for HuggingFace
4
+ from langchain_experimental.agents import create_pandas_dataframe_agent
5
+ from langchain.agents.agent_types import AgentType
6
+ from typing import Tuple, Optional
7
+ import os # New Import for environment variables/secrets
8
+
9
+
10
+ # --- Utility Functions ---
11
+
12
def handle_start_button_click(button_id: int):
    """Mark the button identified by ``button_id`` as clicked.

    Sets ``st.session_state.clicked[button_id]`` to True. The ``clicked``
    mapping must already exist in the session state.
    """
    click_flags = st.session_state.clicked
    click_flags[button_id] = True
15
+
16
+
17
class _LLMProbeError(RuntimeError):
    """Raised when the connectivity probe gets an empty or too-short reply."""


def _lookup_hf_token() -> Optional[str]:
    """Return the Hugging Face API token from Streamlit secrets or the environment.

    Streamlit secrets are consulted first; the ``HUGGINGFACEHUB_API_TOKEN``
    environment variable is the fallback. Returns None when neither is set.
    """
    try:
        secret_token = st.secrets.get("HUGGINGFACEHUB_API_TOKEN")
    except Exception:
        # Accessing st.secrets raises when no secrets.toml exists, which would
        # otherwise make the environment-variable fallback unreachable.
        # NOTE(review): the exception class appears to differ between Streamlit
        # releases (FileNotFoundError or a Streamlit-specific error) - confirm
        # and narrow this clause if a minimum version is pinned.
        secret_token = None
    return secret_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")


@st.cache_resource(show_spinner="Initializing LLM Agent...")
def _build_hf_agent(df: pd.DataFrame, model_name: str, _hf_token: str):
    """Create and cache the LLM and the pandas agent for ``df`` / ``model_name``.

    Failures are signalled by raising rather than by returning a value, so a
    failed attempt is never stored in the resource cache. ``_hf_token`` is
    underscore-prefixed so that Streamlit leaves it out of the cache key.

    Raises:
        _LLMProbeError: the test prompt produced an empty or too-short reply.
        Exception: whatever the LLM client or agent factory raises.
    """
    llm = HuggingFaceHub(
        repo_id=model_name,
        huggingfacehub_api_token=_hf_token,
        # Adjust model parameters as needed for the chosen model.
        model_kwargs={"temperature": 0.1, "max_length": 512},
    )

    # Simple round trip to confirm the connection; `invoke` replaces the
    # deprecated `llm(...)` call style.
    probe_reply = (llm.invoke("Say 'OK' and nothing else.") or "").strip()
    if len(probe_reply) < 2:
        raise _LLMProbeError(
            f"HuggingFace model responded with an empty or too short response during test: '{probe_reply}'"
        )

    st.sidebar.success(f"Hugging Face model **{model_name}** connected.")

    pandas_agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        # SECURITY: this flag opts in to the agent running model-generated
        # Python code. Only deploy where that is acceptable.
        allow_dangerous_code=True,
    )
    return llm, pandas_agent


def initialize_hf_agent(df: pd.DataFrame, model_name: str) -> Tuple[Optional[HuggingFaceHub], Optional[object]]:
    """Initialize the HuggingFaceHub LLM and the LangChain pandas agent.

    Successful results are cached per ``(df, model_name)``; failed attempts
    are not cached, so a later call retries.

    Args:
        df: The DataFrame the agent will answer questions about.
        model_name: Hugging Face Hub repo id of the model to use.

    Returns:
        ``(llm, pandas_agent)`` on success. On any failure an error is shown
        with ``st.error`` and ``(None, None)`` is returned.
    """
    hf_token = _lookup_hf_token()
    if not hf_token:
        st.error("ERROR: **HUGGINGFACEHUB_API_TOKEN** is not found. Please set it in your environment variables or in your Streamlit secrets (`.streamlit/secrets.toml`).")
        return None, None

    try:
        return _build_hf_agent(df, model_name, hf_token)
    except _LLMProbeError as probe_error:
        st.error(str(probe_error))
    except Exception as e:
        # UI boundary: report the failure instead of crashing the page.
        st.error(f"Failed to initialize HuggingFace model '{model_name}'. Error: {e}")
    return None, None
64
+
65
+
66
@st.cache_data(show_spinner="Querying LLM for EDA steps...")
def steps_eda(_llm) -> str:
    """Ask the LLM for the key steps of Exploratory Data Analysis.

    The answer is cached. ``_llm`` is underscore-prefixed so Streamlit does
    not try to hash it, which means one cached answer is shared by all calls.

    The progress message is the decorator's spinner rather than an
    ``st.info`` call in the body: elements created inside a cached function
    are replayed on every cache hit, so an in-body message would keep
    claiming a query is running when none is.

    Args:
        _llm: LLM object exposing ``invoke(prompt)``.

    Returns:
        The LLM's response to the EDA-steps prompt.
    """
    # `invoke` replaces the deprecated `_llm(...)` call style.
    return _llm.invoke("What are the key steps of Exploratory Data Analysis (EDA)? Provide a bulleted list.")
72
+
73
+
74
@st.cache_data(show_spinner="Running initial EDA queries...")
def _cached_initial_eda(_agent, df) -> bool:
    """Render the automated EDA sequence, cached per DataFrame content.

    ``df`` is deliberately NOT underscore-prefixed, so Streamlit hashes it and
    a different dataset produces a different cache entry. ``_agent`` stays
    excluded from the cache key.
    """
    st.write("**Data Overview**")
    st.write("The first rows of the dataset are:")
    st.dataframe(df.head(), use_container_width=True)

    st.subheader("Data Cleaning")

    columns_df = _agent.run("What are the columns in the dataset?")
    st.write(f"The columns in the dataset are: {columns_df}")

    missing_values = _agent.run("Are there any missing values in the dataset? List the columns and counts.")
    st.write(f"Missing values in the dataset: {missing_values}")

    duplicates = _agent.run("Are there any duplicate rows in the dataset? How many?")
    st.write(f"Duplicate rows in the dataset: {duplicates}")

    st.subheader("Statistical Summary")
    st.dataframe(df.describe(include='all'), use_container_width=True)

    correlation_analysis = _agent.run("Calculate the correlation matrix between all numerical features and summarize the top 3 strongest correlations.")
    st.write(correlation_analysis)

    outliers_and_new_features = _agent.run("Identify any extreme outliers in the numerical features (use IQR method for top 3 columns). Also, are there any obvious new features that could be created from existing columns? Combine these answers.")
    st.write(outliers_and_new_features)

    return True


def function_agent(_agent, _df) -> bool:
    """Run the initial automated EDA sequence and render it in the app.

    Shows the head of the data, asks the agent about columns, missing values
    and duplicates, shows ``describe(include='all')``, then asks about
    correlations, outliers and candidate new features.

    Caching is delegated to a helper keyed on the DataFrame. Previously both
    parameters were underscore-prefixed, so every call shared one cache entry
    and a second upload replayed the first dataset's output.

    Args:
        _agent: Agent object exposing ``run(prompt)``.
        _df: The DataFrame being analysed.

    Returns:
        True once the sequence has been rendered.
    """
    return _cached_initial_eda(_agent, _df)