Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,91 +1,125 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
|
| 4 |
-
import
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
)
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
st.
|
| 23 |
-
1. **Upload** your CSV file in the main area.
|
| 24 |
-
2. **Review** the automatic Exploratory Data Analysis (EDA) summary.
|
| 25 |
-
3. **Ask** your question in the text box. The system uses a LangChain Agent to run Python/Pandas code on your data and provide an answer.
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
- "Show a histogram of the 'Age' column."
|
| 30 |
-
- "Calculate the correlation matrix and identify the strongest relationship."
|
| 31 |
-
- "Perform a bivariate analysis of 'Gender' and 'Purchase_Amount'."
|
| 32 |
-
""")
|
| 33 |
-
st.warning("⚠️ **Note on LLM:** This app uses the Gemini API via LangChain. For production use with a truly open-source LLM, you would need to replace the LLM initialization in `utils.py` with a compatible self-hosted or endpoint-based model.")
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
"
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
st.subheader("Data Preview")
|
| 54 |
-
st.dataframe(df.head())
|
| 55 |
-
|
| 56 |
-
# --- Automated EDA ---
|
| 57 |
-
with st.expander("🔬 Automated Exploratory Data Analysis (EDA)", expanded=False):
|
| 58 |
-
eda_summary = perform_eda(df)
|
| 59 |
-
st.markdown(eda_summary)
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# Text input for the user's query
|
| 65 |
-
user_query = st.text_area(
|
| 66 |
-
"Enter your data question (e.g., 'Plot the distribution of Age' or 'What is the median salary?'):",
|
| 67 |
-
placeholder="Ask me anything about your data...",
|
| 68 |
-
key="user_query_input"
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
# Button to trigger the agent
|
| 72 |
-
if st.button("Run Analysis", use_container_width=True, type="primary") and user_query:
|
| 73 |
-
with st.spinner(f"Analyzing data with LangChain Agent..."):
|
| 74 |
-
# Call the agent function from utils.py
|
| 75 |
-
agent_response = query_agent(df, user_query)
|
| 76 |
-
|
| 77 |
-
st.success("Analysis Complete!")
|
| 78 |
-
st.markdown("---")
|
| 79 |
-
st.subheader("Agent Response")
|
| 80 |
-
# Display the response
|
| 81 |
-
st.info(agent_response)
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
+
# Import the helper functions, including the new guide generator
|
| 4 |
+
from utils import perform_eda, query_agent, generate_eda_guide
|
| 5 |
+
import os
|
| 6 |
|
| 7 |
+
st.set_page_config(layout="wide", page_title="NL Data Analyst Agent")
|
| 8 |
+
|
| 9 |
+
# --- Streamlit UI Setup ---
|
| 10 |
+
st.title("๐ Natural Language Data Analyst")
|
| 11 |
+
st.markdown("Upload a CSV file and ask the Mistral-powered agent questions about your data.")
|
| 12 |
+
|
| 13 |
+
# --- Sidebar for Instructions and LLM-Generated EDA Steps ---
|
| 14 |
+
st.sidebar.header("Instructions")
|
| 15 |
+
st.sidebar.markdown(
|
| 16 |
+
"""
|
| 17 |
+
1. **Upload** your CSV file below.
|
| 18 |
+
2. The application will immediately run a **preliminary EDA**.
|
| 19 |
+
3. Use the **Query Box** in the main area to ask detailed questions about your data.
|
| 20 |
+
"""
|
| 21 |
)
|
| 22 |
|
| 23 |
+
# Use caching so the LLM guide is only generated once at startup
|
| 24 |
+
@st.cache_data(show_spinner="Generating LLM-Powered EDA Guide...")
|
| 25 |
+
def get_eda_guide():
|
| 26 |
+
"""Wrapper function to cache the result of the LLM guide generation."""
|
| 27 |
+
try:
|
| 28 |
+
return generate_eda_guide()
|
| 29 |
+
except Exception:
|
| 30 |
+
return "Could not generate guide. Check LLM setup."
|
| 31 |
+
|
| 32 |
+
# Put the LLM-generated guide inside an expander
|
| 33 |
+
with st.sidebar.expander("๐ **LLM-Powered EDA Guide**", expanded=True):
|
| 34 |
+
guide_content = get_eda_guide()
|
| 35 |
+
st.markdown(guide_content)
|
| 36 |
+
|
| 37 |
+
st.sidebar.info("Remember: The agent executes Python code. Be clear and specific with your requests, especially for plotting and complex analysis.")
|
| 38 |
+
|
| 39 |
+
# --- File Uploader ---
|
| 40 |
+
uploaded_file = st.file_uploader("Upload CSV File", type="csv")
|
| 41 |
+
|
| 42 |
+
if uploaded_file is not None:
|
| 43 |
+
# Read the file and store it in session state
|
| 44 |
+
@st.cache_data
|
| 45 |
+
def load_data(file):
|
| 46 |
+
try:
|
| 47 |
+
# FIX: Changed delimiter to semicolon (;) and skip bad lines
|
| 48 |
+
df = pd.read_csv(file, sep=';', on_bad_lines='skip')
|
| 49 |
+
return df
|
| 50 |
+
except Exception as e:
|
| 51 |
+
st.error(f"Error reading CSV: {e}")
|
| 52 |
+
st.error("Try checking the file delimiter or encoding.")
|
| 53 |
+
return None
|
| 54 |
|
| 55 |
+
df = load_data(uploaded_file)
|
| 56 |
+
|
| 57 |
+
if df is not None:
|
| 58 |
+
st.session_state.df = df
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
+
st.subheader("Uploaded Data Preview")
|
| 61 |
+
st.dataframe(df.head())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# Perform and display the initial EDA
|
| 64 |
+
eda_markdown = perform_eda(df)
|
| 65 |
+
with st.expander("Preliminary EDA Results (Click to View Structure, Types, and Quality Checks)", expanded=True):
|
| 66 |
+
st.markdown(eda_markdown)
|
| 67 |
+
|
| 68 |
+
# --- Query Interface ---
|
| 69 |
+
st.subheader("Ask Your Data Analyst Agent")
|
| 70 |
+
query = st.text_area(
|
| 71 |
+
"Enter your natural language query:",
|
| 72 |
+
placeholder="e.g., What are the mean and standard deviation of the 'salary' column? Or: Show me a scatter plot of 'age' vs 'income'."
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
if st.button("Analyze Data"):
|
| 76 |
+
if query:
|
| 77 |
+
response = ""
|
| 78 |
+
with st.spinner("Agent is analyzing the data..."):
|
| 79 |
+
# Call the LangChain agent
|
| 80 |
+
response = query_agent(df, query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
st.markdown("---")
|
| 83 |
+
st.subheader("Agent Response")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
# --- NEW LOGIC FOR FORMATTING OUTPUT ---
|
| 86 |
+
|
| 87 |
+
# 1. Check if the response contains a plot (Streamlit handles plot display automatically)
|
| 88 |
+
if "plot" in query.lower() or "chart" in query.lower() or "graph" in query.lower():
|
| 89 |
+
st.success("Plot generated successfully (displayed below).")
|
| 90 |
+
st.text_area("Agent's Explanation:", value=response, height=100)
|
| 91 |
+
|
| 92 |
+
else:
|
| 93 |
+
# 2. Try to parse the response as a DataFrame for clean display
|
| 94 |
+
is_dataframe = False
|
| 95 |
+
|
| 96 |
+
# Common Pandas string representations start with indices or alignment markers
|
| 97 |
+
if response.strip().startswith((' ', '0', '1', '2', 'Index', 'Name')) or '\n' in response:
|
| 98 |
+
try:
|
| 99 |
+
# Use StringIO to read the response string as if it were a file
|
| 100 |
+
from io import StringIO
|
| 101 |
+
temp_df = pd.read_csv(StringIO(response.strip()), sep='\s\s+', engine='python')
|
| 102 |
+
|
| 103 |
+
# A successful parse results in a table, but it might not be the intended data.
|
| 104 |
+
# We check if the parsed table has a reasonable number of columns (> 1)
|
| 105 |
+
if len(temp_df.columns) > 1:
|
| 106 |
+
st.dataframe(temp_df)
|
| 107 |
+
is_dataframe = True
|
| 108 |
+
|
| 109 |
+
except Exception:
|
| 110 |
+
# If parsing fails, it's not a clean DataFrame output, so treat it as text.
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
# 3. If it wasn't a plot and we couldn't parse a clean DataFrame, display as informational text.
|
| 114 |
+
if not is_dataframe:
|
| 115 |
+
st.info(response)
|
| 116 |
+
|
| 117 |
+
else:
|
| 118 |
+
st.warning("Please enter a query to analyze the data.")
|
| 119 |
+
|
| 120 |
+
else:
|
| 121 |
+
st.info("Awaiting CSV file upload.")
|
| 122 |
|
| 123 |
+
# Add a note about the required API key for deployment
|
| 124 |
+
if 'HUGGINGFACEHUB_API_TOKEN' not in st.session_state and 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
|
| 125 |
+
st.sidebar.error("HuggingFace API Token is missing. Set the `HUGGINGFACEHUB_API_TOKEN` environment variable or Streamlit Secret.")
|