Spaces:

sanjaystarc
/

data-analysis-agent

Sleeping

App Files Files Community

data-analysis-agent / app.py

sanjaystarc

Update app.py

3a72d72 verified 4 months ago

raw

history blame contribute delete

8.3 kB

	import os
	import streamlit as st
	import pandas as pd
	import numpy as np
	import requests
	import json
	import time
	import matplotlib.pyplot as plt
	import seaborn as sns

	# --- CONFIG ---
	# The Gemini API key is read from the environment (e.g. a Space secret).
	# Surrounding whitespace is stripped so that a blank or newline-padded
	# secret is treated as missing instead of being sent as a broken key.
	GEMINI_API_KEY = (os.getenv("GEMINI_API_KEY") or "").strip()

	if not GEMINI_API_KEY:
	    st.error("❌ Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
	    # Nothing below can work without a key, so halt this script run here.
	    st.stop()

	# API endpoint and model identifiers
	GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
	# Chat model used for the structured (JSON) analysis responses
	CHAT_MODEL = "gemini-2.5-flash-preview-09-2025"
	# NOTE(review): not referenced in the visible part of this file — confirm it is still needed
	EMBED_MODEL = "models/embedding-001"

	# JSON schema the model's structured response is asked to conform to:
	# one object with two string fields, "reasoning" and "code".
	def _string_field(description):
	    """Build the schema entry for a STRING property with the given description."""
	    return {"type": "STRING", "description": description}


	ANALYSIS_SCHEMA = {
	    "type": "OBJECT",
	    "properties": {
	        "reasoning": _string_field(
	            "A detailed natural language explanation of the analysis, including key findings and context."
	        ),
	        "code": _string_field(
	            "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
	        ),
	    },
	}

	# System prompt sent with every chat request. The fragments are joined
	# verbatim with no separator, so each one carries its own spacing.
	SYSTEM_INSTRUCTION = "".join(
	    [
	        "You are a world-class Data Analyst Agent. Your task is to analyze the provided DataFrame ('df') ",
	        "based on the user's question. You MUST respond with a single JSON object conforming to the provided schema. ",
	        "1. Reasoning: Explain your plan, the steps taken, and the insights derived from the data. Format this in Markdown. ",
	        "2. Code: If the question requires calculation, aggregation, or visualization, you MUST generate Python code to execute against the 'df' DataFrame. ",
	        " - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. ",
	        " - Use Streamlit functions for simple outputs: `st.dataframe(...)`, `st.bar_chart()`, `st.line_chart()`. ",
	        " - For ALL custom, complex plots, you MUST follow this strict Matplotlib sequence: Start with `plt.figure()`, use `plt.` or `sns.` commands for plotting, and explicitly end with `st.pyplot(plt)` to display the output. ",
	        " - CRITICAL GUARDRAIL: When generating code that uses logical conditions (e.g., in `if` statements or for complex filters) on Pandas Series or NumPy arrays, you MUST resolve ambiguity by using `.any()` or `.all()`. Do NOT compare a series directly to a single boolean value.",
	        " - Ensure the code is self-contained and ready to execute.",
	    ]
	)

	# --- Helper Functions ---

	def chat_with_gemini(prompt, context):
	    """Ask the Gemini chat model for a structured analysis (reasoning + code).

	    Args:
	        prompt: The user's natural-language question.
	        context: Text describing the dataset; it is placed ahead of the
	            question in the message sent to the model.

	    Returns:
	        The model's reply text decoded with ``json.loads``. The request asks
	        for a JSON object shaped like ``ANALYSIS_SCHEMA``.

	    Raises:
	        requests.exceptions.RequestException: If the request fails with a
	            non-retryable status, or still fails after the last retry.
	        KeyError, IndexError, TypeError, ValueError: If the response lacks
	            the expected candidate text or that text is not valid JSON.
	    """
	    # 'models/' is part of the URL path in front of the model name.
	    url = f"{GEMINI_BASE}/models/{CHAT_MODEL}:generateContent"
	    # The key travels in a header rather than the query string so it cannot
	    # leak through URLs embedded in exception messages shown to the user.
	    headers = {"x-goog-api-key": GEMINI_API_KEY}

	    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

	    payload = {
	        "contents": [
	            {"parts": [{"text": full_prompt}]}
	        ],
	        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
	        "generationConfig": {
	            "responseMimeType": "application/json",
	            "responseSchema": ANALYSIS_SCHEMA,
	        },
	    }

	    max_retries = 5
	    delay = 1  # seconds; doubled after every failed attempt
	    for attempt in range(max_retries):
	        try:
	            # `json=` serialises the payload and sets the JSON Content-Type.
	            # The (connect, read) timeout keeps a stalled connection from
	            # hanging the app forever.
	            r = requests.post(url, headers=headers, json=payload, timeout=(10, 120))
	            r.raise_for_status()
	            data = r.json()

	            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
	            return json.loads(json_str)

	        except requests.exceptions.RequestException as e:
	            # Errors without an HTTP status (connection problems, timeouts,
	            # undecodable bodies), rate limiting and server errors may be
	            # transient; other 4xx responses will not improve on retry.
	            status = getattr(e.response, "status_code", None)
	            retryable = status is None or status == 429 or status >= 500
	            if retryable and attempt < max_retries - 1:
	                time.sleep(delay)
	                delay *= 2
	            else:
	                st.error(f"API Request Failed: {e}")
	                raise
	        except (KeyError, IndexError, TypeError, ValueError) as e:
	            st.error(f"Failed to parse model response or execute operation: {e}")
	            raise

	# --- UI ---
	@st.cache_data
	def load_data(file):
	    """Parse the uploaded CSV into a DataFrame (cached by Streamlit).

	    On any parsing failure the error is shown in the app and an empty
	    DataFrame is returned instead of raising.
	    """
	    try:
	        return pd.read_csv(file)
	    except Exception as e:
	        st.error(f"Failed to load CSV: {e}")
	        return pd.DataFrame()


	def _run_generated_code(code, df):
	    """Execute model-generated Python with `df` and the helper libraries in scope.

	    SECURITY: this runs untrusted, model-generated code with the full
	    privileges of the app process. The dedicated namespace only keeps this
	    module's globals (such as the API key variable) out of direct reach;
	    it is NOT a sandbox.
	    """
	    # A single dict serves as both globals and locals. With separate dicts,
	    # functions and lambdas defined by the generated code could not see
	    # names that code assigns at its own top level.
	    namespace = {
	        "df": df,
	        "st": st,
	        "pd": pd,
	        "np": np,
	        "plt": plt,
	        "sns": sns,
	    }
	    try:
	        exec(code, namespace)
	    finally:
	        # Close figures even when the code raised, so a half-built plot
	        # cannot bleed into the next run.
	        plt.close("all")


	st.title("✨Data Analyst Agent (Code Execution Enabled)")
	st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")

	# Session-state slot for the DataFrame, created once per session
	if "df" not in st.session_state:
	    st.session_state.df = pd.DataFrame()

	uploaded = st.file_uploader("Upload CSV", type=["csv"])

	if uploaded:
	    st.session_state.df = load_data(uploaded)

	    if not st.session_state.df.empty:
	        st.subheader("Data Preview (First 5 Rows)")
	        st.dataframe(st.session_state.df.head())

	        question = st.text_area("Ask a complex question or request a visualization (e.g., 'Show the average of the 'Sales' column', 'Plot the distribution of 'Age'):")

	        if st.button("Analyze & Execute") and question:
	            df = st.session_state.df  # the frame the generated code will operate on

	            # Summarise the dataset (column names + first rows) for the LLM
	            column_list = ", ".join(df.columns.astype(str))
	            sample_rows = df.head(5).to_string(index=False)
	            context = f"Dataset Columns: {column_list}\n\nFirst 5 rows of data:\n{sample_rows}"

	            st.markdown("---")
	            st.subheader("🤖 Analysis Steps")

	            plan_ready = False  # becomes True only if step 1 completed
	            code = ""
	            with st.spinner("1. Generating analysis plan and code..."):
	                try:
	                    # 1. Get the structured response from the LLM
	                    analysis_result = chat_with_gemini(question, context)

	                    reasoning = analysis_result.get("reasoning", "No reasoning provided.")
	                    # `or ""` also covers an explicit null for "code"
	                    code = analysis_result.get("code") or ""

	                    st.markdown("#### 💬 Reasoning:")
	                    st.markdown(reasoning)

	                    st.markdown("#### 🐍 Generated Code:")
	                    st.code(code, language="python")

	                    plan_ready = True
	                except Exception as e:
	                    st.error(f"Step 1 Failed (LLM Interaction): {e}")

	            if plan_ready and code:
	                with st.spinner("2. Executing code and generating output..."):
	                    try:
	                        # 2. Run the generated code against the DataFrame
	                        _run_generated_code(code, df)
	                    except Exception as e:
	                        st.error("Step 2 Failed (Code Execution Error): The agent generated invalid code. Check the console for full traceback.")
	                        st.exception(e)
	                    else:
	                        st.success("Code execution complete. Results are displayed above.")
	            elif plan_ready:
	                # Shown only when the model answered without code, not when
	                # step 1 itself failed.
	                st.info("No code was generated, as the question was purely informational.")
	    else:
	        st.info("The uploaded CSV file appears to be empty.")

	else:
	    st.info("👆 Upload a CSV file to begin the full analysis experience.")