Spaces:

LeannJoy
/

DataAnalysisApp

Sleeping

App Files Files Community

DataAnalysisApp / utils.py

LeannJoy

Upload 5 files

ae15120 verified 6 months ago

raw

history blame

4.36 kB

	import streamlit as st
	import pandas as pd
	from langchain_community.llms import HuggingFaceHub # New Import for HuggingFace
	from langchain_experimental.agents import create_pandas_dataframe_agent
	from langchain.agents.agent_types import AgentType
	from typing import Tuple, Optional
	import os # New Import for environment variables/secrets


	# --- Utility Functions ---

	def handle_start_button_click(button_id: int):
	    """Record in Streamlit session state that the given button was clicked.

	    Args:
	        button_id: Key into ``st.session_state.clicked`` whose flag is set
	            to ``True``.
	    """
	    clicked_flags = st.session_state.clicked
	    clicked_flags[button_id] = True


	@st.cache_resource(show_spinner="Initializing LLM Agent...")
	def initialize_hf_agent(df: pd.DataFrame, model_name: str) -> Tuple[Optional[HuggingFaceHub], Optional[create_pandas_dataframe_agent]]:
	    """
	    Build a HuggingFaceHub LLM and a LangChain pandas DataFrame agent on top of it.

	    The function is wrapped in ``st.cache_resource`` so the slow LLM
	    initialization is not repeated on every rerun.

	    Args:
	        df: DataFrame the agent will answer questions about.
	        model_name: Hugging Face repo id of the model to load.

	    Returns:
	        ``(llm, pandas_agent)`` on success. ``(None, None)`` after reporting
	        the problem with ``st.error`` when the API token is missing, the
	        model cannot be initialized, or its test reply is empty/too short.

	    NOTE(review): the return value is what ``st.cache_resource`` stores, so a
	    ``(None, None)`` failure is presumably cached for this ``(df, model_name)``
	    pair as well until the cache is cleared -- confirm this is acceptable.
	    """
	    token_key = "HUGGINGFACEHUB_API_TOKEN"

	    # Streamlit secrets take precedence over the environment. Accessing
	    # st.secrets raises FileNotFoundError (or a subclass of it) when no
	    # secrets.toml exists, which would otherwise abort before the
	    # environment-variable fallback is ever consulted.
	    try:
	        hf_token = st.secrets.get(token_key)
	    except FileNotFoundError:
	        hf_token = None
	    hf_token = hf_token or os.getenv(token_key)

	    if not hf_token:
	        st.error("ERROR: HUGGINGFACEHUB_API_TOKEN is not found. Please set it in your environment variables or in your Streamlit secrets (`.streamlit/secrets.toml`).")
	        return None, None

	    # 1. Initialize the HuggingFaceHub LLM and confirm the connection works.
	    try:
	        llm = HuggingFaceHub(
	            repo_id=model_name,
	            huggingfacehub_api_token=hf_token,
	            # Adjust model parameters as needed, e.g. to match
	            # instruction-tuned requirements.
	            model_kwargs={"temperature": 0.1, "max_length": 512},
	        )
	        # Simple round trip so connection problems surface here rather than
	        # on the first real query.
	        llm_test_response = llm("Say 'OK' and nothing else.")
	    except Exception as e:
	        st.error(f"Failed to initialize HuggingFace model '{model_name}'. Error: {e}")
	        return None, None

	    # Normalize before validating so a None/empty reply is reported with the
	    # intended message instead of failing on `.strip()`.
	    test_reply = (llm_test_response or "").strip()
	    if len(test_reply) < 2:
	        st.error(f"HuggingFace model responded with an empty or too short response during test: '{test_reply}'")
	        return None, None

	    st.sidebar.success(f"Hugging Face model {model_name} connected.")

	    # 2. Create the pandas agent.
	    # SECURITY: allow_dangerous_code=True opts in to the agent executing
	    # model-generated Python against the DataFrame. Only run this where that
	    # is acceptable (trusted data, sandboxed host).
	    pandas_agent = create_pandas_dataframe_agent(
	        llm,
	        df,
	        verbose=True,
	        # ZERO_SHOT_REACT_DESCRIPTION is generally well-supported.
	        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
	        allow_dangerous_code=True,
	    )
	    return llm, pandas_agent


	@st.cache_data(show_spinner=False)
	def steps_eda(_llm) -> str:
	    """Ask the LLM for a bulleted outline of the key EDA steps.

	    Args:
	        _llm: Callable LLM object; it is invoked with the prompt string. The
	            leading underscore keeps Streamlit from hashing it for the
	            cache key.

	    Returns:
	        Whatever the LLM call returns for the fixed EDA prompt.
	    """
	    eda_prompt = "What are the key steps of Exploratory Data Analysis (EDA)? Provide a bulleted list."
	    st.info("Querying LLM for EDA steps...")
	    return _llm(eda_prompt)


	@st.cache_data(show_spinner="Running initial EDA queries...")
	def _cached_initial_eda(_agent, df) -> bool:
	    """Run and render the automated EDA sequence, cached per DataFrame.

	    ``df`` deliberately has no leading underscore so Streamlit hashes it into
	    the cache key: a different dataset triggers a fresh run instead of
	    replaying the previous dataset's output. ``_agent`` keeps its underscore
	    so it is excluded from hashing.
	    """
	    st.write("Data Overview")
	    st.write("The first rows of the dataset are:")
	    st.dataframe(df.head(), use_container_width=True)

	    st.subheader("Data Cleaning")

	    columns_df = _agent.run("What are the columns in the dataset?")
	    st.write(f"The columns in the dataset are: {columns_df}")

	    missing_values = _agent.run("Are there any missing values in the dataset? List the columns and counts.")
	    st.write(f"Missing values in the dataset: {missing_values}")

	    duplicates = _agent.run("Are there any duplicate rows in the dataset? How many?")
	    st.write(f"Duplicate rows in the dataset: {duplicates}")

	    st.subheader("Statistical Summary")
	    st.dataframe(df.describe(include='all'), use_container_width=True)

	    correlation_analysis = _agent.run("Calculate the correlation matrix between all numerical features and summarize the top 3 strongest correlations.")
	    st.write(correlation_analysis)

	    outliers_and_new_features = _agent.run("Identify any extreme outliers in the numerical features (use IQR method for top 3 columns). Also, are there any obvious new features that could be created from existing columns? Combine these answers.")
	    st.write(outliers_and_new_features)

	    return True


	def function_agent(_agent, _df) -> bool:
	    """Run the initial automated EDA sequence and render it in the app.

	    Shows the head of the DataFrame, asks the agent about columns, missing
	    values and duplicates, shows ``describe(include='all')``, then asks the
	    agent for a correlation summary and for outliers / candidate features.

	    With both parameters underscore-prefixed, caching this function directly
	    would hash no argument, so every call would share one cache entry and a
	    newly loaded DataFrame would get the first dataset's results. The work is
	    therefore delegated to a cached helper keyed on the DataFrame.

	    Args:
	        _agent: Agent object exposing ``run(question)``.
	        _df: DataFrame being analysed.

	    Returns:
	        ``True`` once the sequence has completed.
	    """
	    return _cached_initial_eda(_agent, _df)


	# Keep `function_agent.clear()` working for callers that reset the cache.
	function_agent.clear = _cached_initial_eda.clear