Spaces:

jsds003
/

AnalyticsAgent

Paused

App Files Files Community

AnalyticsAgent / src /streamlit_app.py

jsds003

Added the application's contents to the initial commit

27a10b6 6 months ago

raw

history blame

12 kB

	import pandas as pd
	from transformers import pipeline
	import streamlit as st
	from pygwalker.api.streamlit import StreamlitRenderer
	import re
	from typing import List, Any

	@st.cache_resource
	def getPipeline():
	    """Build the text-generation pipeline used by the agents.

	    The function is wrapped in ``st.cache_resource``, so the pipeline object
	    it returns is cached by Streamlit instead of being rebuilt on each call.

	    Returns:
	        The object returned by ``transformers.pipeline`` for the
	        ``text-generation`` task and the Nemotron Nano 4B model.
	    """
	    model_id = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
	    generator = pipeline("text-generation", model=model_id)
	    return generator


	@st.cache_resource
	def get_pyg_renderer(df: pd.DataFrame):
	    """Create the PyGWalker renderer for ``df``.

	    The renderer is built from the ``df`` argument itself rather than from
	    ``st.session_state.df``. ``st.cache_resource`` keys its cache on the
	    arguments, so using the argument keeps the cached renderer and its cache
	    key in agreement, and the function no longer depends on session state
	    having been populated.

	    Args:
	        df: DataFrame to explore.

	    Returns:
	        A ``StreamlitRenderer`` wrapping ``df``.
	    """
	    return StreamlitRenderer(df)

	# Text-generation pipeline used by the agent functions in this file.
	pipe = getPipeline()

	def FileSummaryHelper(df: pd.DataFrame) -> str:
	    """Gather baseline information about the dataset.

	    Args:
	        df: The DataFrame to summarise.

	    Returns:
	        A text block listing, for every column, its dtype and the percentage
	        of missing values, followed by the total number of rows and the
	        number of duplicated rows.
	    """
	    # Computed once for the whole frame; iterating positionally alongside
	    # df.dtypes also works when two columns share the same label.
	    missing_share = df.isna().mean()

	    # Plain "|" separators: an escaped "\|" is an invalid escape sequence in a
	    # normal string and would put a literal backslash into the text.
	    colSummaries = [
	        f"'{col}' | Data Type: {dtype} | Missing Percentage: {missing * 100:.2f}%"
	        for col, dtype, missing in zip(df.columns, df.dtypes, missing_share)
	    ]
	    colTypesAndNulls = "\n".join(colSummaries)

	    # keep=False flags every member of a duplicate group, so this counts all
	    # rows that have at least one identical copy.
	    duplicateVals = df.duplicated(keep=False).sum()
	    totalVals = len(df)

	    return f"""
	The columns of the data have the following datatypes and missing value percentages:
	{colTypesAndNulls}

	The dataset has {totalVals} total rows.

	The dataset has {duplicateVals} duplicated rows.
	"""

	def FileDescriptionAgent(userDesc: str, df: pd.DataFrame) -> str:
	    """Ask the model for a short qualitative description of the dataset.

	    Args:
	        userDesc: Optional description supplied by the user; when empty it
	            contributes nothing to the prompt.
	        df: The DataFrame being described.

	    Returns:
	        The ``generated_text`` of the first pipeline output.
	    """
	    if userDesc:
	        user_note = "I have described the dataset as follows: " + userDesc
	    else:
	        user_note = ""

	    stats_block = FileSummaryHelper(df)
	    column_names = ', '.join(df.columns.tolist())

	    # The prompt lists the column names, the computed statistics and the
	    # optional user note, then states the task.
	    request = f""" You are given a DataFrame `df` with columns: {column_names}
	{stats_block}
	{user_note}

	Qualitatively describe the dataset in 2-3 concise sentences. Your response must only include the description with no explanations before or after."""

	    chat = [
	        {"role": "system", "content": "detailed thinking off. You are an insightful Data Analyst."},
	        {"role": "user", "content": request},
	    ]

	    outputs = pipe(chat, temperature=0.2, max_new_tokens=1024, return_full_text=False)
	    return outputs[0]['generated_text']

	def AnlaysisQuestionAgent(summary: str) -> List[str]:
	    """Ask the model for analytical questions about the summarised dataset.

	    Args:
	        summary: Text summary of the dataset, sent as the user message.

	    Returns:
	        The non-empty fragments of the model output after splitting on
	        numbered-list markers ("1.", "2.", ...), each stripped of
	        surrounding whitespace. The prompt asks for three questions, but the
	        list length depends on what the model actually produced.
	    """
	    system_text = """detailed thinking off. You are an inquisitive Data Analyst.
	Given the following summary of a dataset, create a list of 3 analytical questions, following these rules:

	Rules
	-----
	1. The questions must be answerable through simple Pandas operations with only the given data.
	2. Your response must only include the three questions in a numbered list. Do not include explanations or caveats before or after.
	3. Ensure the output list is formated: 1. question1, 2. question2, 3. question3
	"""
	    messages = [
	        {"role": "system", "content": system_text},
	        {"role": "user", "content": summary},
	    ]

	    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']

	    parts = re.split(r'\d+\.\s*', response)

	    # Strip BEFORE filtering: a whitespace-only fragment (e.g. a newline that
	    # precedes "1.") is truthy and would otherwise survive as an empty
	    # question, shifting the real questions by one position.
	    stripped = (p.strip() for p in parts)
	    return [question for question in stripped if question]

	def CodeGeneratorTool(cols: List[str], query: str) -> str:
	    """Build the prompt asking the LLM for pandas-only code (no plotting).

	    Args:
	        cols: Column names of the DataFrame, joined with ", " in the prompt.
	        query: The question the generated code has to answer.

	    Returns:
	        The prompt text, including the rules that the code must operate on
	        ``df``, store its answer in ``result`` and be wrapped in one python
	        code fence.
	    """
	    column_list = ', '.join(cols)
	    instructions = f"""
	Given DataFrame `df` with columns: {column_list}
	Write Python code (pandas only, no plotting) to answer:
	"{query}"

	Rules
	-----
	1. Use pandas operations on `df` only.
	2. Assign the final result to `result`.
	3. Wrap the snippet in a single ```python code fence (no extra prose).
	"""
	    return instructions

	def CodeExecutionHelper(code: str, df: pd.DataFrame):
	    """Execute generated code against ``df`` and return its ``result``.

	    Args:
	        code: Python source to run. It sees ``pd`` and ``df`` and is expected
	            to bind its answer to the name ``result``.
	        df: The DataFrame exposed to the code as ``df``.

	    Returns:
	        The value bound to ``result`` by the code, ``None`` when the code
	        never assigns ``result``, or the string
	        ``"Error executing code: <message>"`` when execution raises.
	    """
	    # NOTE(security): exec() runs model-generated code with the full
	    # privileges of this process. Nothing here sandboxes it.
	    env = {"pd": pd, "df": df}
	    try:
	        # One namespace serves as both globals and locals. With separate
	        # dicts, functions and lambdas defined by the snippet resolve names
	        # through globals only and cannot see ``df`` or ``pd``.
	        exec(code, env)
	    except Exception as exc:
	        return f"Error executing code: {exc}"
	    return env.get("result", None)

	def CodeExtractorHelper(text: str) -> str:
	    """Return the contents of the first ```python fenced block in ``text``.

	    Args:
	        text: Model output that may contain a fenced python snippet.

	    Returns:
	        The stripped text between the first "```python" marker and the next
	        "```". An empty string when either marker is missing.
	    """
	    _, opener, remainder = text.partition("```python")
	    if not opener:
	        return ""

	    snippet, closer, _ = remainder.partition("```")
	    if not closer:
	        return ""

	    return snippet.strip()

	def ToolSelectorAgent(query: str, df: pd.DataFrame):
	    """Ask the model for pandas code answering ``query`` and extract it.

	    Args:
	        query: The user's question.
	        df: DataFrame whose column names are passed to the prompt builder.

	    Returns:
	        Whatever ``CodeExtractorHelper`` extracts from the model output.
	    """
	    user_text = CodeGeneratorTool(df.columns.tolist(), query)

	    # Backslash continuations keep this one string literal; the leading
	    # whitespace of each continued line is part of the prompt text.
	    system_text = "detailed thinking off. You are a Python data-analysis expert who writes clean, efficient code. \
	Solve the given problem with optimal pandas operations. Be concise and focused. \
	Your response must contain ONLY a properly-closed ```python code block with no explanations before or after. \
	Ensure your solution is correct, handles edge cases, and follows best practices for data analysis."

	    chat = [
	        {"role": "system", "content": system_text},
	        {"role": "user", "content": user_text},
	    ]

	    outputs = pipe(chat, temperature=0.2, max_new_tokens=1024, return_full_text=False)
	    generated = outputs[0]['generated_text']
	    return CodeExtractorHelper(generated)

	def ReasoningPromptGenerator(query: str, result: Any) -> str:
	    """Build the prompt that asks the model to explain a computed result.

	    Args:
	        query: The question the user asked.
	        result: The value produced for that question, or an error string
	            beginning with "Error executing code".

	    Returns:
	        The prompt text containing the question and a description of the
	        result.
	    """
	    failed = isinstance(result, str) and result.startswith("Error executing code")

	    # Error messages go through whole; any other value is rendered with
	    # str() and cut to its first 300 characters (presumably to keep the
	    # prompt short).
	    desc = result if failed else str(result)[:300]

	    return f"""
	The user asked: "{query}".
	The result value is: {desc}
	Explain in 2-3 concise sentences what this tells about the data (no mention of charts)."""

	def ReasoningAgent(query: str, result: Any):
	    """Run the reasoning prompt and return the explanation and the thinking.

	    Args:
	        query: The question the user asked.
	        result: The value (or error string) produced for that question.

	    Returns:
	        A ``(response, thinking)`` tuple. When the model output contains
	        "</think>", ``thinking`` is the text before the first occurrence and
	        ``response`` the text after it. Otherwise ``response`` is the whole
	        output and ``thinking`` is an empty string.
	    """
	    prompt = ReasoningPromptGenerator(query, result)

	    messages = [
	        {"role": "system", "content": "detailed thinking on. You are an insightful data analyst"},
	        {"role": "user", "content": prompt},
	    ]

	    response = pipe(messages, temperature=0.2, max_new_tokens=1024, return_full_text=False)[0]['generated_text']

	    # Default to no thinking text so the return below is always defined,
	    # including when the output has no closing "</think>" tag (for example
	    # when generation stops at max_new_tokens first).
	    thinking = ""
	    if "</think>" in response:
	        thinking, response = response.split("</think>", 1)
	    return response, thinking

	def ResponseBuilderTool(question: str) -> str:
	    """Answer ``question`` about the session DataFrame as a chat message.

	    Generates code for the question, executes it against
	    ``st.session_state.df``, asks the model to explain the result, and
	    packages everything into one string of markdown and HTML.

	    Args:
	        question: The user's question.

	    Returns:
	        A string made of a result header, an optional collapsed "Reasoning"
	        section, the explanation text, and a collapsed "View code" section.
	    """
	    # Function-scope import: the escaping below is the only user of it.
	    import html

	    code = ToolSelectorAgent(question, st.session_state.df)
	    result = CodeExecutionHelper(code, st.session_state.df)
	    reasoning_txt, raw_thinking = ReasoningAgent(question, result)
	    reasoning_txt = reasoning_txt.replace("`", "")

	    # Build assistant response
	    if isinstance(result, pd.DataFrame):
	        header = f"Result: {len(result)} rows"
	    elif isinstance(result, pd.Series):
	        header = "Result series"
	    else:
	        header = f"Result: {result}"

	    # Show only reasoning thinking in Model Thinking (collapsed by default).
	    # The text is escaped so that "<", ">" and "&" in the model output are
	    # displayed literally instead of being parsed as markup.
	    thinking_html = ""
	    if raw_thinking:
	        thinking_html = (
	            '<details class="thinking">'
	            '<summary>🧠 Reasoning</summary>'
	            f'<pre>{html.escape(raw_thinking, quote=False)}</pre>'
	            '</details>'
	        )

	    # Code accordion. Generated pandas code often contains "<" or "&"
	    # (comparisons, boolean masks), so it is escaped before being embedded.
	    code_html = (
	        '<details class="code">'
	        '<summary>View code</summary>'
	        '<pre><code class="language-python">'
	        f'{html.escape(code, quote=False)}'
	        '</code></pre>'
	        '</details>'
	    )

	    # Combine thinking, explanation, and code accordion
	    return f"{header}\n\n{thinking_html}{reasoning_txt}\n\n{code_html}"


	def main():
	"""Streamlit App.

	Lets the user upload a CSV, shows a model-written summary and a PyGWalker
	explorer, and runs a question/answer chat in the sidebar.

	NOTE(review): block nesting cannot be read from indentation in this copy
	of the file; the comments below describe individual statements and do not
	assert which branch encloses them.
	"""

	st.set_page_config(layout="wide")
	st.title("Analytics Agent")

	# Only CSV uploads are offered.
	file = st.file_uploader("Choose CSV", type=["csv"])

	if file:
	# (Re)load and summarise when no DataFrame is stored yet or the uploaded
	# file name differs from the one recorded in session state.
	if("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
	st.session_state.df = pd.read_csv(file)
	st.session_state.current_file = file.name
	with st.spinner("Summarizing..."):
	st.session_state.file_summary = FileDescriptionAgent("",st.session_state.df)
	st.markdown("### Data Summary:")
	st.text(st.session_state.file_summary)

	# PyGWalker explorer for the stored DataFrame, opened on the "data" tab.
	pygApp = get_pyg_renderer(st.session_state.df)
	pygApp.explorer(default_tab="data")

	# Inject CSS that sets the sidebar width to 500px.
	# NOTE(review): "#" does not start a comment in CSS; confirm the trailing
	# text inside the rule is harmless in the target browsers.
	st.markdown(
	"""
	<style>
	section[data-testid="stSidebar"] {
	width: 500px !important; # Set the width to your desired value
	}
	</style>
	""",
	unsafe_allow_html=True,
	)

	with st.sidebar:
	st.markdown("## Analysis Discussion:")

	# Initialise the chat-related session state keys when they are missing.
	if("first_question" not in st.session_state):
	st.session_state.first_question = ""

	if("num_question_asked" not in st.session_state):
	st.session_state.num_question_asked = 0

	if("messages" not in st.session_state):
	st.session_state.messages = []

	# Counter == 0: no question asked yet. Offer suggested questions as
	# buttons plus a free-text chat input.
	if st.session_state.num_question_asked == 0:
	with st.spinner("Preparing Anlaysis..."):
	# Suggested questions are generated only when not already stored.
	if("analsyis_questions" not in st.session_state):
	st.session_state.analsyis_questions = AnlaysisQuestionAgent(st.session_state.file_summary)

	# Assumes at least three questions were returned — TODO confirm; indexing
	# [0], [1], [2] raises IndexError on a shorter list.
	with st.container():
	if q1:= st.button(st.session_state.analsyis_questions[0]):
	st.session_state.first_question = st.session_state.analsyis_questions[0]
	if q2:= st.button(st.session_state.analsyis_questions[1]):
	st.session_state.first_question = st.session_state.analsyis_questions[1]
	if q3:= st.button(st.session_state.analsyis_questions[2]):
	st.session_state.first_question = st.session_state.analsyis_questions[2]

	chat = st.chat_input("Something else...")
	if chat:
	st.session_state.first_question = chat

	# Advance the counter when any button was clicked or the chat input
	# returned a value other than None.
	st.session_state.num_question_asked += 1 if(q1 or q2 or q3 or chat is not None) else 0
	# Record the first question as a user message and rerun the script.
	if st.session_state.num_question_asked == 1:
	st.session_state.messages.append({"role": "user", "content": st.session_state.first_question})
	st.rerun()

	# Counter == 1: the first question is recorded but not yet answered.
	# Replay the stored messages, compute the answer, bump the counter, rerun.
	elif st.session_state.num_question_asked == 1:
	with st.container():
	for msg in st.session_state.messages:
	with st.chat_message(msg["role"]):
	st.markdown(msg["content"], unsafe_allow_html=True)
	with st.spinner("Working …"):
	st.session_state.messages.append({
	"role": "assistant",
	"content": ResponseBuilderTool(st.session_state.first_question)
	})
	st.session_state.num_question_asked += 1
	st.rerun()

	# Any other counter value: replay the conversation and handle follow-up
	# questions typed into the chat input.
	else:
	with st.container():
	for msg in st.session_state.messages:
	with st.chat_message(msg["role"]):
	st.markdown(msg["content"], unsafe_allow_html=True)
	if user_q := st.chat_input("Ask about your data…"):
	st.session_state.messages.append({"role": "user", "content": user_q})
	with st.spinner("Working …"):
	st.session_state.messages.append({
	"role": "assistant",
	"content": ResponseBuilderTool(user_q)
	})
	st.session_state.num_question_asked += 1
	st.rerun()

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    main()