Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| from langchain_community.llms import HuggingFaceHub # New Import for HuggingFace | |
| from langchain_experimental.agents import create_pandas_dataframe_agent | |
| from langchain.agents.agent_types import AgentType | |
| from typing import Tuple, Optional | |
| import os # New Import for environment variables/secrets | |
| # --- Utility Functions --- | |
def handle_start_button_click(button_id: int) -> None:
    """Record in the Streamlit session state that a button was clicked.

    Args:
        button_id: Key of the entry in ``st.session_state.clicked`` to set
            to ``True``.

    Note:
        ``st.session_state.clicked`` is not created here; it must already
        exist and support item assignment before this callback runs.
    """
    clicked_flags = st.session_state.clicked
    clicked_flags[button_id] = True
def _read_hf_token() -> Optional[str]:
    """Return the Hugging Face API token from Streamlit secrets or the environment.

    Streamlit secrets take priority; the ``HUGGINGFACEHUB_API_TOKEN``
    environment variable is the fallback. Returns ``None`` (or an empty
    string) when neither source provides a value.
    """
    try:
        token = st.secrets.get("HUGGINGFACEHUB_API_TOKEN")
    except FileNotFoundError:
        # Streamlit raises FileNotFoundError (or a subclass of it) when no
        # secrets.toml is configured. Without this guard the environment
        # variable fallback below would never be reached.
        token = None
    return token or os.getenv("HUGGINGFACEHUB_API_TOKEN")


def initialize_hf_agent(df: pd.DataFrame, model_name: str) -> Tuple[Optional[HuggingFaceHub], Optional[create_pandas_dataframe_agent]]:
    """Build a HuggingFaceHub LLM and a LangChain pandas agent over ``df``.

    The function looks up the API token, instantiates the LLM, sends one
    short test prompt to confirm the connection, and then creates the
    pandas DataFrame agent. Failures are reported to the user through
    ``st.error`` instead of being raised.

    Args:
        df: DataFrame the agent will answer questions about.
        model_name: Hugging Face repository id of the model to use.

    Returns:
        ``(llm, pandas_agent)`` on success, where ``pandas_agent`` is the
        object returned by ``create_pandas_dataframe_agent``.
        ``(None, None)`` when the token is missing, the model cannot be
        initialised, or the test prompt yields an empty/too short reply.

    NOTE(review): nothing in this function caches its result; each call
    rebuilds the LLM and issues a test request. If caching is intended,
    confirm that a caching decorator is applied to this function.
    """
    hf_token = _read_hf_token()
    if not hf_token:
        st.error("ERROR: **HUGGINGFACEHUB_API_TOKEN** is not found. Please set it in your environment variables or in your Streamlit secrets (`.streamlit/secrets.toml`).")
        return None, None

    # 1. Initialize the HuggingFaceHub LLM and probe it once. Only the calls
    #    that talk to the remote service are guarded by the try block.
    try:
        llm = HuggingFaceHub(
            repo_id=model_name,
            huggingfacehub_api_token=hf_token,
            # Adjust model parameters as needed, e.g. to match
            # instruction-tuned requirements.
            model_kwargs={"temperature": 0.1, "max_length": 512},
        )
        # Simple test run to confirm the connection works.
        llm_test_response = llm("Say 'OK' and nothing else.")
    except Exception as e:
        # Top-level boundary: any client/network failure becomes a UI error.
        st.error(f"Failed to initialize HuggingFace model '{model_name}'. Error: {e}")
        return None, None

    # Normalise a missing reply to "" so the message below can be formatted
    # without calling .strip() on None.
    test_reply = (llm_test_response or "").strip()
    if len(test_reply) < 2:
        st.error(f"HuggingFace model responded with an empty or too short response during test: '{test_reply}'")
        return None, None
    st.sidebar.success(f"Hugging Face model **{model_name}** connected.")

    # 2. Create the pandas agent.
    # SECURITY: allow_dangerous_code=True lets the agent execute Python code
    # generated by the LLM. Only use this with trusted data and deployments.
    pandas_agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        # ZERO_SHOT_REACT_DESCRIPTION is generally well-supported.
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True,
    )
    return llm, pandas_agent
def steps_eda(_llm) -> str:
    """Ask the LLM for the key steps of Exploratory Data Analysis.

    An info banner is shown in the Streamlit UI before the query is sent.

    Args:
        _llm: Callable LLM object; it is invoked with a single prompt string.

    Returns:
        Whatever ``_llm`` returns for the EDA prompt.
    """
    st.info("Querying LLM for EDA steps...")
    eda_prompt = "What are the key steps of Exploratory Data Analysis (EDA)? Provide a bulleted list."
    return _llm(eda_prompt)
def function_agent(_agent, _df) -> bool:
    """Run the initial automated EDA sequence and render it in Streamlit.

    The sequence is: a preview of the first rows, three data-cleaning
    questions answered by the agent, a ``describe`` summary table, and two
    further agent questions (correlations, then outliers/new features).

    Args:
        _agent: Agent object exposing ``run(prompt)``.
        _df: DataFrame being analysed; only ``head()`` and ``describe()``
            are called on it here.

    Returns:
        ``True`` once every step has been rendered.
    """
    # --- Overview -----------------------------------------------------
    st.write("**Data Overview**")
    st.write("The first rows of the dataset are:")
    preview = _df.head()
    st.dataframe(preview, use_container_width=True)

    # --- Cleaning: each question is asked, then shown with its label ---
    st.subheader("Data Cleaning")
    cleaning_checks = (
        (
            "What are the columns in the dataset?",
            "The columns in the dataset are: ",
        ),
        (
            "Are there any missing values in the dataset? List the columns and counts.",
            "Missing values in the dataset: ",
        ),
        (
            "Are there any duplicate rows in the dataset? How many?",
            "Duplicate rows in the dataset: ",
        ),
    )
    for question, label in cleaning_checks:
        answer = _agent.run(question)
        st.write(f"{label}{answer}")

    # --- Statistics ---------------------------------------------------
    st.subheader("Statistical Summary")
    summary = _df.describe(include='all')
    st.dataframe(summary, use_container_width=True)

    # These answers are written out as-is, without a label.
    follow_up_questions = (
        "Calculate the correlation matrix between all numerical features and summarize the top 3 strongest correlations.",
        "Identify any extreme outliers in the numerical features (use IQR method for top 3 columns). Also, are there any obvious new features that could be created from existing columns? Combine these answers.",
    )
    for question in follow_up_questions:
        st.write(_agent.run(question))

    return True