DataAnalysisApp / utils.py
LeannJoy's picture
Upload 5 files
ae15120 verified
raw
history blame
4.36 kB
import streamlit as st
import pandas as pd
from langchain_community.llms import HuggingFaceHub # New Import for HuggingFace
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from typing import Tuple, Optional
import os # New Import for environment variables/secrets
# --- Utility Functions ---
def handle_start_button_click(button_id: int):
    """Record in session state that the button ``button_id`` was clicked.

    Sets ``st.session_state.clicked[button_id]`` to ``True``.

    Args:
        button_id: Key of the button inside the ``clicked`` container.

    Note:
        The ``clicked`` container is not created here; presumably the app
        initializes it before wiring this callback (verify against caller).
    """
    click_flags = st.session_state.clicked
    click_flags[button_id] = True
@st.cache_resource(show_spinner="Initializing LLM Agent...")
def initialize_hf_agent(df: pd.DataFrame, model_name: str) -> Tuple[Optional[HuggingFaceHub], Optional[create_pandas_dataframe_agent]]:
    """Build the HuggingFaceHub LLM and a LangChain pandas agent over ``df``.

    The result is cached with ``st.cache_resource`` so the slow LLM setup is
    not repeated for the same arguments.

    Args:
        df: DataFrame the agent will answer questions about.
        model_name: Hugging Face Hub repo id of the model to use.

    Returns:
        ``(llm, pandas_agent)`` on success, or ``(None, None)`` when the API
        token is missing, the model cannot be initialized, or the connection
        test returns an empty/too-short answer. Failures are reported to the
        user through ``st.error``.

    Note:
        NOTE(review): the second element of the return annotation names the
        factory function ``create_pandas_dataframe_agent`` rather than a type;
        it is kept as-is so the signature stays unchanged.
        NOTE(review): because the function is cached, a ``(None, None)``
        failure result is presumably cached as well for the same arguments;
        clearing the cache may be needed to retry - confirm.
    """
    token_key = "HUGGINGFACEHUB_API_TOKEN"

    # Look in Streamlit secrets first. Accessing st.secrets raises when no
    # secrets file exists (the exception type differs between Streamlit
    # versions), which previously prevented the environment-variable
    # fallback from ever being reached.
    try:
        hf_token = st.secrets.get(token_key)
    except Exception:
        hf_token = None

    # Fall back to the process environment.
    hf_token = hf_token or os.getenv(token_key)

    if not hf_token:
        st.error("ERROR: **HUGGINGFACEHUB_API_TOKEN** is not found. Please set it in your environment variables or in your Streamlit secrets (`.streamlit/secrets.toml`).")
        return None, None

    # 1. Initialize HuggingFaceHub LLM
    try:
        # Uses the specified model which should be mistralai/Mistral-7B-Instruct-v0.2
        llm = HuggingFaceHub(
            repo_id=model_name,
            huggingfacehub_api_token=hf_token,
            # Adjust model parameters as needed, e.g., to match instruction-tuned requirements
            model_kwargs={"temperature": 0.1, "max_length": 512},
        )

        # Simple test run to confirm connection
        llm_test_response = llm("Say 'OK' and nothing else.")

        # Normalize once so a None response is reported as "empty" instead of
        # raising AttributeError and being misreported by the handler below.
        test_text = (llm_test_response or "").strip()
        if len(test_text) < 2:
            st.error(f"HuggingFace model responded with an empty or too short response during test: '{test_text}'")
            return None, None

        st.sidebar.success(f"Hugging Face model **{model_name}** connected.")
    except Exception as e:
        # Boundary handler: any failure while contacting the model is shown
        # to the user rather than crashing the app.
        st.error(f"Failed to initialize HuggingFace model '{model_name}'. Error: {e}")
        return None, None

    # 2. Create Pandas Agent
    pandas_agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        # Using ZERO_SHOT_REACT_DESCRIPTION which is generally well-supported
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        # The agent executes model-generated Python against the DataFrame;
        # this flag is the explicit opt-in the library requires for that.
        allow_dangerous_code=True,
    )
    return llm, pandas_agent
@st.cache_data(show_spinner=False)
def steps_eda(_llm) -> str:
    """Ask the LLM to list the key steps of Exploratory Data Analysis.

    Args:
        _llm: Callable LLM object; invoked directly with the prompt string.
            The leading underscore keeps Streamlit from hashing it for the
            cache key.

    Returns:
        Whatever the LLM returns for the fixed EDA prompt.
    """
    eda_prompt = "What are the key steps of Exploratory Data Analysis (EDA)? Provide a bulleted list."
    st.info("Querying LLM for EDA steps...")
    return _llm(eda_prompt)
@st.cache_data(show_spinner="Running initial EDA queries...")
def _run_initial_eda(_agent, df: pd.DataFrame) -> bool:
    """Render the automated EDA report for ``df``; cached per DataFrame content.

    ``df`` is deliberately NOT underscore-prefixed so Streamlit includes it in
    the cache key; ``_agent`` stays excluded from hashing.
    """
    st.write("**Data Overview**")
    st.write("The first rows of the dataset are:")
    st.dataframe(df.head(), use_container_width=True)

    st.subheader("Data Cleaning")
    columns_df = _agent.run("What are the columns in the dataset?")
    st.write(f"The columns in the dataset are: {columns_df}")
    missing_values = _agent.run("Are there any missing values in the dataset? List the columns and counts.")
    st.write(f"Missing values in the dataset: {missing_values}")
    duplicates = _agent.run("Are there any duplicate rows in the dataset? How many?")
    st.write(f"Duplicate rows in the dataset: {duplicates}")

    st.subheader("Statistical Summary")
    st.dataframe(df.describe(include='all'), use_container_width=True)
    correlation_analysis = _agent.run("Calculate the correlation matrix between all numerical features and summarize the top 3 strongest correlations.")
    st.write(correlation_analysis)
    outliers_and_new_features = _agent.run("Identify any extreme outliers in the numerical features (use IQR method for top 3 columns). Also, are there any obvious new features that could be created from existing columns? Combine these answers.")
    st.write(outliers_and_new_features)
    return True


def function_agent(_agent, _df) -> bool:
    """Run the initial automated EDA sequence and render it in the app.

    Shows the head of the data, asks the agent about columns, missing values
    and duplicates, shows ``describe(include='all')``, then asks the agent for
    a correlation summary and an outlier/new-feature analysis.

    Args:
        _agent: Agent object exposing ``run(prompt)``.
        _df: DataFrame being analysed.

    Returns:
        ``True`` once the sequence has been rendered.

    Note:
        Previously this function itself was cached with both parameters
        underscore-prefixed, so neither took part in the cache key and the
        first dataset's output was replayed for every later dataset. The work
        is now delegated to a cached helper keyed on the DataFrame.
    """
    return _run_initial_eda(_agent, _df)


# Preserve the cache-clearing hook callers could use on the formerly
# decorated function.
function_agent.clear = _run_initial_eda.clear