BI-dashboard / data_processing.py
antonymilne's picture
Tidy and a couple of new features (#2)
4cfea5b verified
"""Data processing functions for the Superstore BI dashboard."""
import random
import pandas as pd
import us
THIS_YEAR = 2017
LAST_YEAR = 2016
COLUMN_TO_METRIC = {"Sales": "Sales", "Profit": "Profit", "Order ID": "Orders", "Customer ID": "Customers"}
COLUMN_TO_AGGFUNC = {"Sales": "sum", "Profit": "sum", "Order ID": "nunique", "Customer ID": "nunique"}
def make_superstore_df():
"""Load and preprocess the Superstore dataset.
Returns:
pd.DataFrame: Processed dataframe with state codes, filtered to latest 2 years,
and enriched with Year, Month, and Order Status columns.
"""
df = pd.read_csv("superstore.csv", encoding="latin1", parse_dates=["Order Date", "Ship Date"])
# Map state names to state codes using us library. Needed for px.choropleth.
df["State Code"] = df["State"].map({state.name: state.abbr for state in us.states.STATES})
# Filter dataframe for only the latest 2 years and add a month column.
df["Year"] = df["Order Date"].dt.year
df["Month"] = df["Order Date"].dt.month
df = df[df["Year"].isin([THIS_YEAR, LAST_YEAR])]
# Create order status - randomly assign with weights: 60% Delivered, 10% In Transit, 30% Processing
df["Order Status"] = random.choices(["Delivered", "In Transit", "Processing"], weights=[0.6, 0.1, 0.3], k=len(df))
return df
def make_superstore_profit_df(df):
"""Aggregate product data by sub-category with profit metrics.
Args:
df: Source dataframe containing Sales and Profit columns.
Returns:
pd.DataFrame: Aggregated dataframe with Sub-Category, Sales, Profit,
Profit Margin, and Profit Absolute columns.
"""
df = df.groupby("Sub-Category", as_index=False).agg({"Sales": "sum", "Profit": "sum"})
df["Profit Margin"] = df["Profit"] / df["Sales"]
df["Profit Absolute"] = df["Profit"].abs()
return df
def make_customer_sales_pareto_df(df):
"""Create Pareto analysis dataframe for customer sales.
Args:
df: Source dataframe containing Customer Name and Sales columns.
Returns:
pd.DataFrame: Customer sales ranked with cumulative totals and percentages.
"""
df = df.groupby("Customer Name", as_index=False)["Sales"].sum().sort_values("Sales", ascending=False)
df["Cumulative Sales"] = df["Sales"].cumsum()
df["Cumulative % of Sales"] = 100 * df["Cumulative Sales"] / df["Sales"].sum()
df["Rank"] = range(1, len(df) + 1)
df["% of Total Customers"] = 100 * df["Rank"] / len(df)
return df[["Rank", "Customer Name", "Sales", "Cumulative Sales", "Cumulative % of Sales", "% of Total Customers"]]