Spaces:

LeannJoy
/

DataAnalysisApp

Sleeping

App Files Files Community

DataAnalysisApp / app.py

LeannJoy

Upload 5 files

ae15120 verified 6 months ago

raw

history blame

6.2 kB

	import streamlit as st
	import pandas as pd
	import time

	# --- Import Utilities ---
	# The helper functions live in a sibling module, so utils.py must sit in the
	# same directory as app.py for this import to resolve.
	try:
	    from utils import (
	        function_agent,
	        handle_start_button_click,
	        initialize_hf_agent,
	        steps_eda,
	    )
	except ImportError as import_error:
	    # ImportError is also raised when utils.py is present but one of its own
	    # imports fails (ModuleNotFoundError is a subclass), so show the underlying
	    # exception next to the hint instead of only blaming the file location.
	    st.error("ERROR: Could not import 'utils.py'. Please ensure 'utils.py' is in the same folder as 'app.py'.")
	    st.exception(import_error)
	    # Nothing below can work without the helpers; halt this script run.
	    st.stop()


	# --- Configuration Constants ---
	# Identifier of the one Hugging Face model this app offers; there is no
	# model picker, so a single constant is enough.
	HUGGINGFACE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"


	# --- Streamlit UI Setup ---
	# Page-level configuration, followed by the title and greeting of the main pane.
	st.set_page_config(
	    page_title="AI Data Science Assistant",
	    layout="wide",
	)
	st.title("AI Assistant for Data Science 🤖")
	st.write("Hello, 👋 I am your AI assistant and I am here to help you with your data science projects.")

	# --- Sidebar: intro text and LLM engine details ---
	# NOTE(review): the source reached review with its indentation stripped; the
	# nesting here (everything inside the sidebar, three elements inside the
	# expander) is reconstructed — confirm against the original layout.
	with st.sidebar:
	    st.write("Your Data Science Adventure Begins with an CSV File. ")
	    st.caption("You may already know that every exciting data science journey starts with a CSV file. Upload your CSV file to get started!")

	    st.divider()

	    # Collapsible panel naming the engine and the model in use.
	    engine_details = st.expander("LLM Engine Details")
	    engine_details.info("Using Hugging Face Hub as the LLM Engine.")
	    engine_details.write(f"Model: `{HUGGINGFACE_MODEL}`")
	    engine_details.caption("This requires the HUGGINGFACEHUB_API_TOKEN to be set in your Streamlit secrets or environment variables.")

	    # There is only one model, but it is still written to session state on
	    # every run so code that reads `selected_model` keeps working.
	    st.session_state.selected_model = HUGGINGFACE_MODEL

	    st.divider()
	    st.caption("Developed by [Your Name]")


	# --- Session State Initialization ---

	# Seed each key only when it is absent, so values written during an earlier
	# run of this script are never overwritten. `clicked` maps a button id to
	# whether it has been pressed.
	_session_defaults = {
	    "clicked": {1: False},
	    "df": None,
	    "pandas_agent": None,
	    "llm": None,
	    "selected_model": HUGGINGFACE_MODEL,
	}
	for _state_key, _state_default in _session_defaults.items():
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default


	# Start button: the click is delegated to the imported callback with button
	# id 1 (presumably it sets st.session_state.clicked[1] — confirm in utils.py).
	st.button("Let's Get Started!", on_click=handle_start_button_click, args=(1,))


	# --- Main Application Logic ---
	# Flow: wait for the start button, accept a CSV upload, build the agent, run
	# the automated EDA, then answer two free-text questions.
	# NOTE(review): the source reached review with its indentation stripped; the
	# nesting below is reconstructed from data flow. Confirm in particular whether
	# the "Further Study" section belongs at upload level and whether the
	# "Data Science Problem" section belongs inside the final-question branch.
	if st.session_state.clicked[1]:
	    user_csv = st.file_uploader("Upload your CSV file", type="csv")

	    if user_csv is not None:

	        # 1. Load Data
	        # Rewind before parsing (presumably because the upload buffer may
	        # already have been consumed on an earlier run — TODO confirm).
	        user_csv.seek(0)
	        try:
	            df = pd.read_csv(user_csv, low_memory=False)
	        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as read_error:
	            # An empty, malformed, or non-UTF-8 upload is a user-input problem:
	            # report it in the page and halt instead of showing a traceback.
	            st.error(f"Could not read the uploaded CSV file: {read_error}")
	            st.stop()
	        st.session_state.df = df

	        # 2. Initialize Agent (uses imported utility function)
	        st.session_state.llm, st.session_state.pandas_agent = initialize_hf_agent(
	            df, st.session_state.selected_model
	        )

	        # No agent means initialization failed; utils.py is expected to have
	        # shown the error already, so just halt this run.
	        if st.session_state.pandas_agent is None:
	            st.stop()

	        # --- Main Execution Flow ---

	        st.header("Exploratory Data Analysis (EDA)")

	        # Display EDA steps in a sidebar expander; the llm object is passed to
	        # the imported steps_eda helper and its result rendered as markdown.
	        with st.sidebar:
	            with st.expander("What are the steps of EDA?"):
	                steps_text = steps_eda(st.session_state.llm)
	                st.markdown(steps_text)

	        # Run the initial, automated EDA sequence (imported utility function).
	        function_agent(st.session_state.pandas_agent, st.session_state.df)

	        st.divider()
	        st.subheader("Variable of Study")
	        user_question_variable = st.text_input("What variable would you like to analyze (e.g., 'price') and what question do you have about it? (e.g., 'What is the distribution of age?')")

	        if user_question_variable:
	            st.info(f"Analyzing variable: {user_question_variable}")

	            # One combined prompt so the agent is invoked once per question.
	            variable_analysis_prompt = (
	                f"Analyze the variable {user_question_variable}. "
	                "Specifically, provide summary statistics (mean, median, mode, quartiles), "
	                "identify any outliers, and check for missing values. "
	                "Also, use your plotting tool to create a histogram or a box plot for this variable. "
	                "Output the plot code separately."
	            )

	            with st.spinner("Running deep variable analysis..."):
	                # perf_counter is monotonic, so the elapsed value cannot be
	                # skewed by system clock adjustments.
	                start_time = time.perf_counter()
	                response = st.session_state.pandas_agent.run(variable_analysis_prompt)
	                end_time = time.perf_counter()

	            st.write(response)

	            st.info(f"Analysis Time: {end_time - start_time:.2f} seconds")
	            st.warning("The agent will output Python code for plots. You would need to manually execute this code to visualize it.")

	        st.divider()
	        st.subheader("Further Study")

	        user_question_dataframe = st.text_input("Do you have any other final questions about the dataset or need a complex visualization? (e.g., 'What is the correlation between age and salary?')")

	        if user_question_dataframe:
	            st.info(f"Final question: {user_question_dataframe}")

	            with st.spinner("Running final analysis..."):
	                final_response = st.session_state.pandas_agent.run(user_question_dataframe)

	            st.write(final_response)

	            st.divider()
	            st.header("Data Science Problem")
	            st.write("Now that we have a solid grasp of the data at hand and a clear understanding of the variables we intend to investigate, it's time to define the specific data science problem we aim to solve. This step is crucial as it sets the direction for our analysis and helps us determine the appropriate methodologies and techniques to employ.")