Spaces:
Sleeping
Sleeping
Sw1ft0
Add special header case handling to data loading section. Append schema with example rows to be passed to Gemini for context. Improve the prompt and format_result function.
7f64dba
| import pandas as pd | |
| import gradio as gr | |
| import numpy as np | |
| import os | |
| import google.generativeai as genai # requires GOOGLE_API_KEY set as env var | |
# 1. Configure Gemini
# NOTE(review): the import comment above mentions GOOGLE_API_KEY, but the
# variable actually read here is GEMINI_API_KEY — confirm which one the
# deployment sets.
_gemini_api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("gemini-2.5-pro")
# 2. Load Excel data
_DATA_DIR = "data_source"
_REPORT1_PATH = os.path.join(_DATA_DIR, "OC Onboarding Information.xlsx")
_REPORT2_PATH = os.path.join(_DATA_DIR, "The Alex Ideas Report.xlsx")

# Report 2: every sheet, default header row.
df2 = pd.read_excel(_REPORT2_PATH, sheet_name=None)

# Report 1: every sheet; "PY Event Diary" keeps its real column names on
# the second row of the sheet, so it needs header=1.  Using ExcelFile
# parses the workbook once — the original loaded all sheets raw merely to
# enumerate sheet names, then re-read the whole file from disk once more
# for every sheet.
df1 = {}
with pd.ExcelFile(_REPORT1_PATH) as _workbook:
    for _sheet in _workbook.sheet_names:
        _header_row = 1 if _sheet == "PY Event Diary" else 0
        df1[_sheet] = _workbook.parse(sheet_name=_sheet, header=_header_row)
# Build schema info for prompts
def get_schema_info(reports1=None, reports2=None):
    """Return a text description of both reports' sheets for the LLM prompt.

    Each sheet contributes one line with its name and column list, plus —
    when the sheet has at least one row — one example row.  Parameters
    default to the module-level ``df1`` / ``df2`` dicts so the existing
    zero-argument call keeps working, but explicit dicts of DataFrames may
    be supplied (e.g. for testing).
    """
    reports1 = df1 if reports1 is None else reports1
    reports2 = df2 if reports2 is None else reports2

    def _describe(title, report):
        # One title line per report, then schema (+ optional sample) per sheet.
        chunk = [title]
        for sheet, frame in report.items():
            chunk.append(f"Sheet: {sheet}, Columns: {list(frame.columns)}")
            # Guard: the original indexed [0] unconditionally and raised
            # IndexError on any empty sheet.
            if not frame.empty:
                sample = frame.head(1).to_dict(orient="records")[0]
                chunk.append(f"Example row: {sample}")
        return chunk

    lines = _describe("Report 1 - OC Onboarding Information:", reports1)
    lines += _describe("\nReport 2 - The Alex Ideas Report:", reports2)
    return "\n".join(lines)
# Snapshot of the schema description, computed once at import time and
# interpolated into every prompt sent to Gemini.
schema_info = get_schema_info()
# Helper to format results nicely
def format_result(result):
    """Render an arbitrary ``result`` value as chat-friendly text/number.

    Numeric scalars (including numpy scalars) are rounded to 2 decimal
    places, dicts and lists are formatted recursively, pandas objects use
    their built-in string renderings (DataFrames truncated to head()),
    and anything else falls back to ``str``.
    """
    # Unwrap numpy scalars to plain Python first; the original called
    # round(result.item(), 2) directly, which TypeErrors on non-numeric
    # numpy scalars such as np.str_.
    if isinstance(result, np.generic):
        result = result.item()
    # bool is a subclass of int — without this guard True/False fall into
    # the round() branch and render as 1/0.
    if isinstance(result, bool):
        return str(result)
    if isinstance(result, (int, float)):
        return round(result, 2)
    # Convert dicts into readable "key: value" lines.
    if isinstance(result, dict):
        return "\n".join(f"{k}: {format_result(v)}" for k, v in result.items())
    # Convert lists into a comma-separated string; recurse so nested
    # numerics get rounded consistently with the dict branch.
    if isinstance(result, list):
        return ", ".join(str(format_result(v)) for v in result)
    if isinstance(result, pd.Series):
        return result.to_string()
    if isinstance(result, pd.DataFrame):
        return result.head().to_string(index=False)
    return str(result)
# 3. Core function
# Minimal builtins whitelist for the exec'd model code.  The original
# passed {"__builtins__": {}}, which removed EVERY builtin — so generated
# code using len()/round()/sorted() (the prompt explicitly asks for
# rounding) crashed with NameError.
_SAFE_BUILTINS = {
    "abs": abs, "all": all, "any": any, "bool": bool, "dict": dict,
    "enumerate": enumerate, "float": float, "int": int, "len": len,
    "list": list, "max": max, "min": min, "range": range, "round": round,
    "set": set, "sorted": sorted, "str": str, "sum": sum, "tuple": tuple,
    "zip": zip,
}


def _strip_code_fences(code):
    """Remove markdown ``` fence lines (with or without a language tag).

    The original used code.strip("```").replace("python", ""), which also
    deleted the substring "python" anywhere inside the code body itself.
    Dropping whole fence lines leaves the code untouched.
    """
    kept = [ln for ln in code.splitlines() if not ln.strip().startswith("```")]
    return "\n".join(kept).strip()


def _sanitize(code):
    """Drop import statements and shell commands from generated code.

    Matches both "import x" and "from x import y" (the original missed
    the latter), and requires a trailing space so identifiers such as
    "important_total" are not removed by mistake.
    """
    safe_lines = []
    for line in code.splitlines():
        stripped = line.strip()
        if stripped.startswith(("import ", "from ", "!")):
            continue
        safe_lines.append(line)
    return "\n".join(safe_lines)


def answer_question(history, message):
    """Answer a user question by asking Gemini for pandas code and running it.

    history: chat history (list of [user, assistant] pairs) — currently
        unused, kept for interface compatibility with the Gradio callback.
    message: latest user message (string).

    Returns the formatted result string, or an "Execution error: ..."
    string if generation or execution fails.
    """
    prompt = f"""
You are a data analysis assistant.
You can ONLY answer questions using the two Excel reports provided (df1 and df2).
Do not hallucinate or use external knowledge.
The reports are loaded as dictionaries of DataFrames:
- Access Report 1 with df1['SheetName']
- Access Report 2 with df2['SheetName']
Do not reload Excel files with pandas.
If you are unsure whether the question is relevant, try to reason using the columns available.
If absolutely no relation to provided sheets, respond with:
"I can only answer questions about the provided Excel reports."
The reports have the following schema:
{schema_info}
The user asked:
{message}
Rules:
- Use only pandas, df1, df2, and Python built-ins.
- Do NOT write import statements (pandas is already imported as pd).
- Always put the answer in a variable named `result`.
- Return ONLY Python code, nothing else.
- If multiple values are tied for the maximum, include all of them in a list.
- If result is numeric, round to 2 decimal places.
- If result is a list, return the full list (not just the first element).
- If a column is missing, return a clear error string in `result`, do not crash.
"""
    try:
        # Ask Gemini for code, then clean it up before execution.
        response = model.generate_content(prompt)
        code = _strip_code_fences(response.text.strip())
        safe_code = _sanitize(code)
        # Prepare sandbox with only the data frames and pandas in scope.
        local_vars = {"df1": df1, "df2": df2, "pd": pd}
        # SECURITY: exec of LLM-generated code is inherently risky even
        # with a restricted builtins whitelist (attribute access can reach
        # further than it looks) — run only in a trusted deployment.
        exec(safe_code, {"__builtins__": _SAFE_BUILTINS}, local_vars)
        result = local_vars.get("result", "No result produced")
        return str(format_result(result))
    except Exception as e:
        return f"Execution error: {e}"
# 5. Gradio UI — a minimal chat front-end over answer_question.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me a question about the reports...")
    # ClearButton resets both the input box and the chat history.
    clear = gr.ClearButton([msg, chatbot])
    def respond(message, chat_history):
        # Delegate to the Gemini-backed analyzer, append the
        # (user, assistant) pair, and clear the input textbox.
        answer = answer_question(chat_history, message)
        chat_history.append((message, answer))
        return "", chat_history
    # Submitting the textbox sends (msg, chatbot) in and receives
    # (cleared msg, updated chatbot) back.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
# 6. Run locally (Spaces will call demo.launch() automatically)
if __name__ == "__main__":
    demo.launch()