"""Data processing functions for the Superstore BI dashboard.""" import random import pandas as pd import us THIS_YEAR = 2017 LAST_YEAR = 2016 COLUMN_TO_METRIC = {"Sales": "Sales", "Profit": "Profit", "Order ID": "Orders", "Customer ID": "Customers"} COLUMN_TO_AGGFUNC = {"Sales": "sum", "Profit": "sum", "Order ID": "nunique", "Customer ID": "nunique"} def make_superstore_df(): """Load and preprocess the Superstore dataset. Returns: pd.DataFrame: Processed dataframe with state codes, filtered to latest 2 years, and enriched with Year, Month, and Order Status columns. """ df = pd.read_csv("superstore.csv", encoding="latin1", parse_dates=["Order Date", "Ship Date"]) # Map state names to state codes using us library. Needed for px.choropleth. df["State Code"] = df["State"].map({state.name: state.abbr for state in us.states.STATES}) # Filter dataframe for only the latest 2 years and add a month column. df["Year"] = df["Order Date"].dt.year df["Month"] = df["Order Date"].dt.month df = df[df["Year"].isin([THIS_YEAR, LAST_YEAR])] # Create order status - randomly assign with weights: 60% Delivered, 10% In Transit, 30% Processing df["Order Status"] = random.choices(["Delivered", "In Transit", "Processing"], weights=[0.6, 0.1, 0.3], k=len(df)) return df def make_superstore_profit_df(df): """Aggregate product data by sub-category with profit metrics. Args: df: Source dataframe containing Sales and Profit columns. Returns: pd.DataFrame: Aggregated dataframe with Sub-Category, Sales, Profit, Profit Margin, and Profit Absolute columns. """ df = df.groupby("Sub-Category", as_index=False).agg({"Sales": "sum", "Profit": "sum"}) df["Profit Margin"] = df["Profit"] / df["Sales"] df["Profit Absolute"] = df["Profit"].abs() return df def make_customer_sales_pareto_df(df): """Create Pareto analysis dataframe for customer sales. Args: df: Source dataframe containing Customer Name and Sales columns. Returns: pd.DataFrame: Customer sales ranked with cumulative totals and percentages. """ df = df.groupby("Customer Name", as_index=False)["Sales"].sum().sort_values("Sales", ascending=False) df["Cumulative Sales"] = df["Sales"].cumsum() df["Cumulative % of Sales"] = 100 * df["Cumulative Sales"] / df["Sales"].sum() df["Rank"] = range(1, len(df) + 1) df["% of Total Customers"] = 100 * df["Rank"] / len(df) return df[["Rank", "Customer Name", "Sales", "Cumulative Sales", "Cumulative % of Sales", "% of Total Customers"]]