Spaces:

k-pavlo
/

excel-ai-analyzer

Sleeping

Sw1ft0

Expand safe_builtins and add them in the prompt

b109b68 3 months ago

10.8 kB

	import pandas as pd
	import gradio as gr
	import numpy as np
	import os
	from openai import OpenAI # requires OPENAI_API_KEY set as env var

	# 1. OpenAI client setup
	# Model identifier used by the chat-completion request further down.
	MODEL_NAME = "gpt-5"
	# The API key comes from the OPENAI_API_KEY environment variable (None when unset).
	client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


	# 2. Load Excel data
	# Workbook locations, defined once so every read below targets the same file.
	ONBOARDING_XLSX = os.path.join("data_source", "OC Onboarding Information.xlsx")
	IDEAS_XLSX = os.path.join("data_source", "The Alex Ideas Report.xlsx")

	# Report 1: OC Onboarding Information.
	# The workbook is opened once and the handle is reused for every read, instead of
	# re-opening and re-parsing the file for each sheet.
	df1 = {}
	with pd.ExcelFile(ONBOARDING_XLSX) as onboarding_book:
	    # Raw copy of every sheet (no header row applied), keyed by sheet name.
	    df1_all = pd.read_excel(onboarding_book, sheet_name=None, header=None)

	    # Handle special header cases
	    for sheet, raw_df in df1_all.items():
	        if sheet == "Budget":
	            # The header-less "Budget" sheet is already available from df1_all,
	            # so it does not need to be read a second time.
	            budget_raw = raw_df

	            # Part 1: detailed breakdown. header=2 is 0-based, i.e. spreadsheet
	            # row 3 holds the column names; the 17 rows after it (rows 4-20)
	            # are the data.
	            budget_details = pd.read_excel(
	                onboarding_book,
	                sheet_name="Budget",
	                header=2,
	                nrows=17,
	            )

	            # Part 2: summary section (spreadsheet rows 21-33).
	            headers = budget_raw.iloc[2, 4:].tolist()  # row 3, columns E onward
	            metrics = budget_raw.iloc[20:33, 1].tolist()  # column B = row labels
	            # reset_index returns a new frame, so the raw sheet is not modified
	            # by the column renaming / insertion below.
	            values = budget_raw.iloc[20:33, 4:].reset_index(drop=True)
	            values.columns = headers
	            values.insert(0, "Metric", metrics)

	            budget_summary = values

	            # The single "Budget" sheet is exposed as two logical tables.
	            df1["Budget_Details"] = budget_details
	            df1["Budget_Summary"] = budget_summary
	        elif sheet == "Rooms per category":
	            # Spreadsheet row 4 is the header row.
	            df1[sheet] = pd.read_excel(onboarding_book, sheet_name=sheet, header=3)
	        else:
	            # Every other sheet uses spreadsheet row 2 as the header row.
	            df1[sheet] = pd.read_excel(onboarding_book, sheet_name=sheet, header=1)

	# Report 2: The Alex Ideas Report (same single-open approach).
	df2 = {}
	with pd.ExcelFile(IDEAS_XLSX) as ideas_book:
	    # Raw copy of every sheet (no header row applied), keyed by sheet name.
	    df2_all = pd.read_excel(ideas_book, sheet_name=None, header=None)

	    for sheet, raw_df in df2_all.items():
	        if sheet == "Report Criteria":
	            # Spreadsheet row 3 is the header row.
	            df2[sheet] = pd.read_excel(ideas_book, sheet_name=sheet, header=2)
	        else:
	            # Default: the first row is the header row.
	            df2[sheet] = pd.read_excel(ideas_book, sheet_name=sheet)

	# Build schema info for prompts
	# Returns one newline-joined string describing the sheets held in the module-level
	# df1 and df2 dictionaries: sheet name, column names and an example row.
	# Takes no arguments; the text is what gets interpolated into the model prompt.
	def get_schema_info():
	lines = ["Report 1 - OC Onboarding Information:"]
	for sheet, df in df1.items():
	if sheet == "Budget_Details":
	lines.append("Sheet: Budget_Details (line-item breakdown by channel/segment)")
	lines.append(f"Columns: {list(df.columns)}")
	elif sheet == "Budget_Summary":
	# For the summary table the values of the 'Metric' column are listed, and
	# every other column name is reported under the "Month columns" label.
	lines.append("Sheet: Budget_Summary (metrics by month; first column = 'Metric')")
	lines.append(f"Metrics available: {df['Metric'].tolist()}")
	lines.append(f"Month columns: {[c for c in df.columns if c != 'Metric']}")
	else:
	lines.append(f"Sheet: {sheet}, Columns: {list(df.columns)}")
	# Add one example row for context
	# NOTE(review): indentation is lost in this copy of the file, so it is unclear
	# whether this example-row step runs for every df1 sheet or only inside the
	# else branch above - confirm against the original source.
	try:
	# head(1) -> list of row dicts; [0] fails on a sheet with no rows.
	sample = df.head(1).to_dict(orient="records")[0]
	lines.append(f"Example row: {sample}")
	except Exception:
	# Any failure while extracting the sample is replaced by a placeholder line.
	lines.append("Example row: [no data available]")
	# The leading "\n" leaves a blank line between the two report sections.
	lines.append("\nReport 2 - The Alex Ideas Report:")
	for sheet, df in df2.items():
	lines.append(f"Sheet: {sheet}, Columns: {list(df.columns)}")
	try:
	sample = df.head(1).to_dict(orient="records")[0]
	lines.append(f"Example row: {sample}")
	except Exception:
	lines.append("Example row: [no data available]")
	return "\n".join(lines)

	# Built once when the module is loaded; this text is interpolated into the prompt.
	schema_info = get_schema_info()
	# Despite the label, this is the length of the schema text only, not of a full prompt.
	print("Prompt size (chars):", len(schema_info))

	# Helper to format results nicely
	def format_result(result):
	    """Convert a computed value into something readable for the chat window.

	    Args:
	        result: Any value produced by the generated analysis code.

	    Returns:
	        - numbers (Python or numpy) rounded to 2 decimal places,
	        - booleans unchanged,
	        - dicts as "key: value" lines (values formatted recursively),
	        - lists as a comma-separated string,
	        - a pandas Series as its full text rendering,
	        - a pandas DataFrame as the text rendering of its first five rows
	          (index hidden),
	        - anything else via ``str``.
	    """
	    # Unwrap numpy scalars into plain Python values and format those. Rounding
	    # the unwrapped value unconditionally would raise for non-numeric scalars
	    # such as np.str_.
	    if isinstance(result, np.generic):
	        plain = result.item()
	        if isinstance(plain, np.generic):
	            # Some numpy scalars may come back from item() still wrapped;
	            # round them directly rather than recursing forever.
	            return round(plain, 2)
	        return format_result(plain)
	    # bool is a subclass of int: without this guard round(True, 2) returns 1,
	    # so a yes/no answer would be displayed as a number.
	    if isinstance(result, bool):
	        return result
	    if isinstance(result, (int, float)):
	        return round(result, 2)
	    # Convert dicts into readable strings
	    if isinstance(result, dict):
	        return "\n".join(f"{k}: {format_result(v)}" for k, v in result.items())
	    # Convert lists into comma-separated string
	    if isinstance(result, list):
	        return ", ".join(map(str, result))
	    if isinstance(result, pd.Series):
	        return result.to_string()
	    if isinstance(result, pd.DataFrame):
	        return result.head().to_string(index=False)
	    return str(result)

	# 3. Core function
	MAX_REQUESTS_PER_DAY = 100 # Set your desired limit
	# Holds "<YYYY-MM-DD> <count>"; a bare "<count>" written by an older version of
	# this code is still accepted when reading.
	USAGE_FILE = "openai_usage.txt"


	def _today_stamp():
	    """Return today's local date as an ISO ``YYYY-MM-DD`` string."""
	    import datetime  # local import keeps this block self-contained

	    return datetime.date.today().isoformat()


	def get_usage_count():
	    """Return how many requests have been recorded for today.

	    Returns:
	        int: the stored counter when it belongs to today's date (or when the
	        file uses the legacy date-less format); 0 when the file is missing,
	        empty, unparsable, or was written on a different day.
	    """
	    try:
	        with open(USAGE_FILE, "r") as f:
	            fields = f.read().split()
	    except FileNotFoundError:
	        return 0

	    if len(fields) == 2:
	        stamp, raw_count = fields
	        # A counter from another day no longer applies: the limit is per day.
	        if stamp != _today_stamp():
	            return 0
	    elif len(fields) == 1:
	        # Legacy format: only the counter, no date information.
	        raw_count = fields[0]
	    else:
	        # Empty or unrecognised content.
	        return 0

	    try:
	        return int(raw_count)
	    except ValueError:
	        # A corrupt counter is treated as "nothing recorded" instead of raising
	        # on every request.
	        return 0


	def increment_usage_count():
	    """Record one more request for today, overwriting the usage file."""
	    count = get_usage_count() + 1
	    with open(USAGE_FILE, "w") as f:
	        f.write(f"{_today_stamp()} {count}")

	# Names made available to model-generated code in place of the real builtins.
	# Built once at import time instead of on every request.
	_SAFE_BUILTINS = {
	    "abs": abs,
	    "all": all,
	    "any": any,
	    "bool": bool,
	    "dict": dict,
	    "float": float,
	    "int": int,
	    "len": len,
	    "list": list,
	    "max": max,
	    "min": min,
	    "range": range,
	    "str": str,
	    "sum": sum,
	    "round": round,
	    "KeyError": KeyError,
	    "ValueError": ValueError,
	    "sorted": sorted,
	    "enumerate": enumerate,
	    "zip": zip,
	    "set": set,
	    "type": type,
	    "isinstance": isinstance,
	    "pd": pd,
	    "np": np,
	    "print": print,
	    "Exception": Exception,
	}


	def _build_prompt(history, message):
	    """Return the full instruction prompt for one user question."""
	    return f"""
	You are a data analysis assistant.
	You can ONLY answer questions using the two Excel reports provided (df1 and df2).
	Do not hallucinate or use external knowledge.
	The reports are loaded as dictionaries of DataFrames:
	- Access Report 1 with df1['SheetName']
	- Access Report 2 with df2['SheetName']
	Do not reload Excel files with pandas.
	If unsure whether the question is relevant, try to reason using the available columns.
	If absolutely no relation to provided sheets, respond with:
	"I can only answer questions about the provided Excel reports."

	The reports are:
	- OC Onboarding Information (df1): planned hotel data including a budget set by the hotel for the year.
	- "Budget_Details": line-item breakdown (channels, segments, rates, rooms, revenue, ADR).
	- "Budget_Summary": pivot-style table with 'Metric' as the first column
	(e.g. Total, Occupancy %, RevPar, Capacity), and the other columns representing monthly values
	such as 'Jan Rooms', 'Jan Rooms Revenue', 'Jan ADR', 'Feb Rooms', etc.
	- The Alex Ideas Report (df2): actual hotel revenue and performance data.

	They have the following schema (sheet names, columns, and example rows):
	{schema_info}

	The user asked:
	{message}

	The history of the conversation is:
	{history}

	Rules:
	- Use ONLY pandas, df1, df2, and Python built-ins.
	- Do NOT write import statements (pandas is already imported as pd).
	- Access all dataframes ONLY as df1["SheetName"] or df2["SheetName"].
	Never assign them to new variables (e.g. budget_df, property_df).
	- Do not create aliases for dataframes. Always reference them directly.
	- When using df1["Budget_Summary"], always filter by 'Metric' first,
	then select the appropriate month column.
	- ALWAYS put the final answer in a variable named `result`.
	- Return ONLY Python code, nothing else.
	- If multiple values are tied for the maximum, include all of them in a list.
	- If result is numeric, round to 2 decimal places.
	- If result is a list, return the full list (not just the first element).
	- If a required column or metric is missing, return a clear error string in `result` instead of crashing.
	- List of allowed functions: {list(_SAFE_BUILTINS.keys())}
	"""


	def _strip_code_fences(text):
	    """Return *text* without a surrounding Markdown code fence, if it has one."""
	    text = text.strip()
	    if not text.startswith("```"):
	        return text
	    body = text[3:]
	    if body.endswith("```"):
	        body = body[:-3]
	    # Only the language tag on the opening fence line is removed; a blanket
	    # replace("python", "") would also damage code that contains that word.
	    first_line, newline, rest = body.partition("\n")
	    if newline and first_line.strip().lower() in ("", "py", "python", "python3"):
	        body = rest
	    return body.strip()


	def _drop_blocked_lines(code):
	    """Return *code* without import statements and "!" shell-escape lines."""
	    kept = []
	    for line in code.splitlines():
	        stripped = line.strip()
	        first_word = stripped.split(None, 1)[0] if stripped else ""
	        # Compare the whole first word so identifiers such as "important"
	        # survive, and so "from x import y" is removed as well.
	        if first_word in ("import", "from"):
	            continue
	        if stripped.startswith("!"):  # e.g. shell commands
	            continue
	        kept.append(line)
	    return "\n".join(kept)


	def _run_generated_code(code):
	    """Execute *code* with df1/df2/pd in scope and return its ``result`` value."""
	    # NOTE(security): this executes model-generated code. Replacing __builtins__
	    # only narrows the available names; it is not a real sandbox, because
	    # objects such as pd, np and type remain reachable. Treat the model output
	    # as untrusted and consider isolating execution (separate process/container).
	    #
	    # A single namespace is used for globals and locals. With two separate
	    # dicts, exec behaves like a class body: functions (and, before Python
	    # 3.12, comprehensions) in the generated code cannot see variables the
	    # code assigned at its own top level.
	    namespace = {
	        "__builtins__": dict(_SAFE_BUILTINS),  # copy: keep the shared table intact
	        "df1": df1,
	        "df2": df2,
	        "pd": pd,
	    }
	    exec(code, namespace)
	    return namespace.get("result", "No result produced")


	def answer_question(history, message):
	    """Answer one chat message by letting the model write pandas code and running it.

	    Args:
	        history: chat history (list of [user, assistant] pairs)
	        message: latest user message (string)

	    Returns:
	        str: the formatted value of ``result`` from the generated code;
	        "No result produced" when the code sets no ``result``; a fixed notice
	        when the daily request limit is reached; or "Execution error: ..."
	        when the API call or the generated code raises.
	    """
	    if get_usage_count() >= MAX_REQUESTS_PER_DAY:
	        return "OpenAI API usage limit reached for today."
	    # The request is counted before the call is made, so failed calls count too.
	    increment_usage_count()

	    prompt = _build_prompt(history, message)

	    try:
	        # Ask OpenAI
	        response = client.chat.completions.create(
	            model=MODEL_NAME,
	            messages=[
	                {"role": "system", "content": "You are a helpful assistant."},
	                {"role": "user", "content": prompt},
	            ],
	        )
	        # The message content can be None; treat that as an empty reply
	        # instead of failing on .strip().
	        code = _strip_code_fences(response.choices[0].message.content or "")

	        # Debug info
	        print("RAW GPT CODE:\n", code)
	        print("Prompt tokens:", response.usage.prompt_tokens)
	        print("Completion tokens:", response.usage.completion_tokens)
	        print("Finish reason:", response.choices[0].finish_reason)

	        # Remove dangerous or irrelevant lines, then execute what is left.
	        safe_code = _drop_blocked_lines(code)
	        result = _run_generated_code(safe_code)
	        return str(format_result(result))

	    except Exception as e:
	        # UI boundary: any failure is reported in the chat rather than raised.
	        return f"Execution error: {e}"


	# 4. Gradio UI
	with gr.Blocks() as demo:
	    gr.Markdown("""
	# 🏨 Excel Hotel Data Chatbot
	Ask any question about the two provided Excel reports. The assistant will analyze the hotel data and answer using the loaded sheets.
	""")
	    chatbot = gr.Chatbot(height=500, show_copy_button=True)
	    msg = gr.Textbox(placeholder="Ask me a question about the reports...")
	    send_btn = gr.Button("Send")
	    clear = gr.ClearButton([msg, chatbot])

	    def respond(message, chat_history):
	        """Handle one submission from the textbox or the Send button.

	        Args:
	            message: text currently in the input box.
	            chat_history: list of (user, assistant) pairs shown in the chatbot.

	        Returns:
	            tuple: ("", updated history) - the empty string clears the input box.
	        """
	        chat_history = chat_history or []  # tolerate a missing/None history
	        # A blank submission would still cost a model call and count against
	        # the daily limit, so it is ignored.
	        if not message or not message.strip():
	            return "", chat_history
	        answer = answer_question(chat_history, message)
	        chat_history.append((message, answer))
	        return "", chat_history

	    # Pressing Enter in the textbox and clicking "Send" trigger the same handler.
	    msg.submit(respond, [msg, chatbot], [msg, chatbot])
	    send_btn.click(respond, [msg, chatbot], [msg, chatbot])

	# 5. Run locally (Spaces will call demo.launch() automatically)
	if __name__ == "__main__":
	    demo.launch()