File size: 10,769 Bytes
7273bff
 
288f777
5ad7d73
c525991
c944dd0
 
c525991
c944dd0
5ad7d73
 
 
2c65316
7f64dba
 
 
 
 
2c65316
 
 
 
 
7f64dba
2c65316
7f64dba
 
fe06f0e
82bc971
 
 
 
 
 
 
 
 
62f3a56
 
 
82bc971
62f3a56
82bc971
 
c74321c
 
 
 
 
82bc971
c74321c
 
82bc971
 
 
fe06f0e
 
 
 
 
 
7f64dba
 
 
 
fe06f0e
 
7f64dba
5ad7d73
2c65316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ad7d73
 
7f64dba
13a0a1c
c74321c
 
 
 
 
 
 
 
 
 
75e1424
 
 
 
 
7f64dba
13a0a1c
7f64dba
a6bbf7e
 
 
 
 
7f64dba
5ad7d73
 
75e1424
5ad7d73
288f777
 
 
 
7f64dba
 
 
288f777
 
 
 
 
 
7f64dba
 
 
 
 
288f777
5ad7d73
372d86b
 
 
 
 
 
 
 
 
 
 
 
 
 
5ad7d73
b109b68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372d86b
 
 
5ad7d73
 
 
 
 
8f502b4
288f777
8f502b4
7f64dba
 
 
 
c74321c
7f64dba
8f502b4
 
c74321c
 
 
 
 
 
 
62f3a56
c74321c
8f502b4
 
 
 
 
c74321c
 
 
8f502b4
a6bbf7e
8f502b4
a6bbf7e
c74321c
 
 
 
 
8f502b4
7f64dba
 
 
c74321c
b109b68
5ad7d73
 
c74321c
414da23
c944dd0
c525991
c944dd0
 
 
 
bfac968
c944dd0
c525991
8f502b4
 
 
75e1424
 
29cc02b
75e1424
 
 
 
29cc02b
8f502b4
 
 
 
 
 
 
 
 
 
 
5ad7d73
 
8f502b4
8a23027
 
 
8f502b4
 
8a23027
288f777
5ad7d73
414da23
8f502b4
7273bff
5ad7d73
 
7273bff
8d3574a
 
 
 
 
5ad7d73
8d3574a
5ad7d73
 
 
 
 
 
7273bff
5ad7d73
8d3574a
7273bff
5ad7d73
 
c002d00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import pandas as pd
import gradio as gr
import numpy as np
import os
from openai import OpenAI  # requires OPENAI_API_KEY set as env var

# 1. Configure OpenAI
# The client reads OPENAI_API_KEY from the environment; startup fails later
# (at first request) if the key is missing, not here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
MODEL_NAME = "gpt-5"


# 2. Load Excel data
# Load all sheets
# Both workbooks are loaded with sheet_name=None (dict of {sheet: DataFrame})
# and header=None so the true header rows can be located per sheet below —
# several sheets have their headers on non-first rows.
df1_all = pd.read_excel(
    os.path.join("data_source", "OC Onboarding Information.xlsx"),
    sheet_name=None,
    header=None  # load raw to inspect
)
df2_all = pd.read_excel(
    os.path.join("data_source", "The Alex Ideas Report.xlsx"),
    sheet_name=None,
    header=None  # load raw to inspect
)

# Handle special header cases
# Normalize each workbook into a dict of clean DataFrames (df1, df2), fixing
# the sheets whose header row is not the first row.
# NOTE(review): every sheet is re-read from disk even though df1_all/df2_all
# already hold the raw data; raw_df is unused in the loop body.
df1 = {}
for sheet, raw_df in df1_all.items():
    if sheet == "Budget":
        # The "Budget" sheet holds two logical tables stacked vertically,
        # so it is split into "Budget_Details" and "Budget_Summary".
        # Load raw first
        budget_raw = pd.read_excel(
            os.path.join("data_source", "OC Onboarding Information.xlsx"),
            sheet_name="Budget",
            header=None
        )

        # Part 1: detailed breakdown (rows 3–20 with headers at row 3)
        # Row/column offsets here are assumed from the workbook layout —
        # TODO confirm they still match if the file is regenerated.
        budget_details = pd.read_excel(
            os.path.join("data_source", "OC Onboarding Information.xlsx"),
            sheet_name="Budget",
            header=2,
            nrows=17  # rows 3–19 (inclusive)
        )

        # Part 2: summary section (rows 21–33)
        headers = budget_raw.iloc[2, 4:].tolist()   # row3, cols E+ as headers
        metrics = budget_raw.iloc[20:33, 1].tolist()  # colB = row labels
        values = budget_raw.iloc[20:33, 4:].reset_index(drop=True)
        values.columns = headers
        values.insert(0, "Metric", metrics)  # leading label column

        budget_summary = values
        
        # Add into df1 dict
        df1["Budget_Details"] = budget_details
        df1["Budget_Summary"] = budget_summary
    elif sheet == "Rooms per category":
        # use row 4 as header
        df1[sheet] = pd.read_excel(
            os.path.join("data_source", "OC Onboarding Information.xlsx"),
            sheet_name=sheet,
            header=3
        )
    else:
        # Default for report 1: headers on the second row.
        df1[sheet] = pd.read_excel(
            os.path.join("data_source", "OC Onboarding Information.xlsx"),
            sheet_name=sheet,
            header=1
        )

df2 = {}
for sheet, raw_df in df2_all.items():
    if sheet == "Report Criteria":
        # use row 3 as header
        df2[sheet] = pd.read_excel(
            os.path.join("data_source", "The Alex Ideas Report.xlsx"),
            sheet_name=sheet,
            header=2
        )
    else:
        # Default for report 2: standard first-row headers.
        df2[sheet] = pd.read_excel(
            os.path.join("data_source", "The Alex Ideas Report.xlsx"),
            sheet_name=sheet
        )

# Build schema info for prompts
def get_schema_info():
    """Build a textual description of both workbooks for the LLM prompt.

    Lists every sheet with its columns plus one example row, with extra
    guidance for the two derived Budget tables in report 1.
    """
    def describe_sample(frame):
        # One concrete row shows the model real value formats; empty
        # sheets (or odd dtypes) fall back to a placeholder.
        try:
            return f"Example row: {frame.head(1).to_dict(orient='records')[0]}"
        except Exception:
            return "Example row: [no data available]"

    parts = ["Report 1 - OC Onboarding Information:"]
    for name, frame in df1.items():
        if name == "Budget_Details":
            parts.append("Sheet: Budget_Details (line-item breakdown by channel/segment)")
            parts.append(f"Columns: {list(frame.columns)}")
        elif name == "Budget_Summary":
            parts.append("Sheet: Budget_Summary (metrics by month; first column = 'Metric')")
            parts.append(f"Metrics available: {frame['Metric'].tolist()}")
            parts.append(f"Month columns: {[c for c in frame.columns if c != 'Metric']}")
        else:
            parts.append(f"Sheet: {name}, Columns: {list(frame.columns)}")
        parts.append(describe_sample(frame))

    parts.append("\nReport 2 - The Alex Ideas Report:")
    for name, frame in df2.items():
        parts.append(f"Sheet: {name}, Columns: {list(frame.columns)}")
        parts.append(describe_sample(frame))

    return "\n".join(parts)

# Build the schema description once at startup; it is embedded in every prompt.
schema_info = get_schema_info()
print("Prompt size (chars):", len(schema_info))

# Helper to format results nicely
def format_result(result):
    """Render an arbitrary analysis result as a display-friendly value.

    Handles numpy scalars, plain numbers, booleans, dicts (recursively),
    lists, and pandas Series/DataFrames; anything else falls back to str().
    """
    # Booleans first: bool is a subclass of int, so the numeric branch below
    # would silently render True/False as 1/0.
    if isinstance(result, (bool, np.bool_)):
        return result
    # Unwrap numpy scalars to native Python values. Non-numeric numpy scalars
    # (e.g. np.str_) previously crashed in round(); now they fall through to
    # the appropriate branch below.
    if isinstance(result, np.generic):
        result = result.item()
    if isinstance(result, (int, float)):
        return round(result, 2)
    # Dicts become "key: value" lines, formatting each value recursively.
    if isinstance(result, dict):
        return "\n".join(f"{k}: {format_result(v)}" for k, v in result.items())
    # Lists become a comma-separated string.
    if isinstance(result, list):
        return ", ".join(map(str, result))
    if isinstance(result, pd.Series):
        return result.to_string()
    if isinstance(result, pd.DataFrame):
        # Only the first rows, to keep chat output compact.
        return result.head().to_string(index=False)
    return str(result)

# 3. Core function
MAX_REQUESTS_PER_DAY = 100  # Set your desired limit
# NOTE(review): this counter file is never reset, so the limit is effectively
# a lifetime cap, not a daily one — confirm whether a date-based reset is wanted.
USAGE_FILE = "openai_usage.txt"

def get_usage_count():
    """Return the persisted OpenAI request count, or 0 if unavailable.

    An unreadable, empty, or non-numeric usage file is treated as "no usage
    yet" instead of raising ValueError on every request (previous behavior).
    """
    if os.path.exists(USAGE_FILE):
        try:
            with open(USAGE_FILE, "r") as f:
                return int(f.read().strip())
        except (OSError, ValueError):
            # Corrupt/empty counter file: fail open to 0 rather than crash.
            return 0
    return 0

def increment_usage_count():
    """Read the stored request total, add one, and write it back."""
    updated = str(get_usage_count() + 1)
    with open(USAGE_FILE, "w") as f:
        f.write(updated)

def answer_question(history, message):
    """Answer a user question by having the LLM write pandas code over df1/df2.

    history: chat history (list of [user, assistant] pairs)
    message: latest user message (string)

    Returns a display-ready string: the formatted `result` from the generated
    code, a quota message, or an error description.
    """
    # Restricted builtins exposed to the generated code.
    # NOTE(review): exec() of model-generated code is not a real sandbox —
    # attribute access through pd/np can still reach dangerous functionality.
    safe_builtins = {
        "abs": abs,
        "all": all,
        "any": any,
        "bool": bool,
        "dict": dict,
        "float": float,
        "int": int,
        "len": len,
        "list": list,
        "max": max,
        "min": min,
        "range": range,
        "str": str,
        "sum": sum,
        "round": round,
        "KeyError": KeyError,
        "ValueError": ValueError,
        "sorted": sorted,
        "enumerate": enumerate,
        "zip": zip,
        "set": set,
        "type": type,
        "isinstance": isinstance,
        "pd": pd,
        "np": np,
        "print": print,
        "Exception": Exception
    }

    # Enforce the request quota before spending an API call.
    if get_usage_count() >= MAX_REQUESTS_PER_DAY:
        return "OpenAI API usage limit reached for today."
    increment_usage_count()

    prompt = f"""
You are a data analysis assistant. 
You can ONLY answer questions using the two Excel reports provided (df1 and df2).
Do not hallucinate or use external knowledge.
The reports are loaded as dictionaries of DataFrames:
- Access Report 1 with df1['SheetName']
- Access Report 2 with df2['SheetName']
Do not reload Excel files with pandas.
If unsure whether the question is relevant, try to reason using the available columns.
If absolutely no relation to provided sheets, respond with:
"I can only answer questions about the provided Excel reports."

The reports are:
- OC Onboarding Information (df1): planned hotel data including a budget set by the hotel for the year.
    - "Budget_Details": line-item breakdown (channels, segments, rates, rooms, revenue, ADR).
    - "Budget_Summary": pivot-style table with 'Metric' as the first column 
      (e.g. Total, Occupancy %, RevPar, Capacity), and the other columns representing monthly values 
      such as 'Jan Rooms', 'Jan Rooms Revenue', 'Jan ADR', 'Feb Rooms', etc.
- The Alex Ideas Report (df2): actual hotel revenue and performance data.

They have the following schema (sheet names, columns, and example rows):
{schema_info}

The user asked:
{message}

The history of the conversation is:
{history}

Rules:
- Use ONLY pandas, df1, df2, and Python built-ins.
- Do NOT write import statements (pandas is already imported as pd).
- Access all dataframes ONLY as df1["SheetName"] or df2["SheetName"]. 
  Never assign them to new variables (e.g. budget_df, property_df).
- Do not create aliases for dataframes. Always reference them directly.
- When using df1["Budget_Summary"], always filter by 'Metric' first, 
  then select the appropriate month column.
- ALWAYS put the final answer in a variable named `result`.
- Return ONLY Python code, nothing else.
- If multiple values are tied for the maximum, include all of them in a list.
- If result is numeric, round to 2 decimal places.
- If result is a list, return the full list (not just the first element).
- If a required column or metric is missing, return a clear error string in `result` instead of crashing.
- List of allowed functions: {list(safe_builtins.keys())}
    """

    try:
        # Ask OpenAI for Python code that computes the answer.
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        code = response.choices[0].message.content.strip()

        # Strip a surrounding markdown fence if present.
        # (Bug fix: the previous `.replace("python", "")` deleted the
        # substring "python" anywhere inside the generated code.)
        if code.startswith("```"):
            code = code[3:]
            if code.lower().startswith("python"):
                code = code[len("python"):]
            if code.endswith("```"):
                code = code[:-3]
            code = code.strip()

        # Debug info
        print("RAW GPT CODE:\n", code)
        print("Prompt tokens:", response.usage.prompt_tokens)
        print("Completion tokens:", response.usage.completion_tokens)
        print("Finish reason:", response.choices[0].finish_reason)

        # Drop import statements and shell commands the model may emit anyway
        # (also blocks `from x import y`, which the old filter missed).
        safe_lines = [
            line for line in code.splitlines()
            if not line.strip().startswith(("import", "from ", "!"))
        ]
        safe_code = "\n".join(safe_lines)

        # Execute in a restricted namespace. (Removed a dead local_vars
        # assignment that was immediately overwritten.)
        safe_globals = {"__builtins__": safe_builtins, "df1": df1, "df2": df2, "pd": pd}
        local_vars = {}
        exec(safe_code, safe_globals, local_vars)

        # The generated code is instructed to leave its answer in `result`;
        # check locals first, then globals (module-style assignments).
        result = local_vars.get("result", safe_globals.get("result", "No result produced"))
        return str(format_result(result))

    except Exception as e:
        # Surface any API or execution failure to the chat instead of crashing.
        return f"Execution error: {e}"


# 5. Gradio UI
# Simple chat layout: a chatbot pane, a textbox, and send/clear buttons.
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🏨 Excel Hotel Data Chatbot
    Ask any question about the two provided Excel reports. The assistant will analyze the hotel data and answer using the loaded sheets.
    """)
    chatbot = gr.Chatbot(height=500, show_copy_button=True)
    msg = gr.Textbox(placeholder="Ask me a question about the reports...")
    send_btn = gr.Button("Send")
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Handle one turn: answer the message, append to history, clear the box."""
        answer = answer_question(chat_history, message)
        # (message, answer) tuples — assumes the legacy tuple-based Chatbot
        # format, not the newer messages format; confirm against the
        # installed Gradio version.
        chat_history.append((message, answer))
        return "", chat_history

    # Both Enter in the textbox and the Send button trigger the same handler.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    send_btn.click(respond, [msg, chatbot], [msg, chatbot])

# 6. Run locally (Spaces will call demo.launch() automatically)
if __name__ == "__main__":
    demo.launch()