BI-dashboard / data_processing.py
antonymilne's picture
Refactor and simplify Pareto chart with improved data processing
8ba00b6
raw
history blame
2.63 kB
"""Data processing functions for the Superstore BI dashboard."""
import random
import pandas as pd
import us
THIS_YEAR = 2017
LAST_YEAR = 2016
COLUMN_TO_METRIC = {"Sales": "Sales", "Profit": "Profit", "Order ID": "Orders", "Customer ID": "Customers"}
COLUMN_TO_AGGFUNC = {"Sales": "sum", "Profit": "sum", "Order ID": "nunique", "Customer ID": "nunique"}
def make_superstore_df():
"""Load and preprocess the Superstore dataset.
Returns:
pd.DataFrame: Processed dataframe with state codes, filtered to latest 2 years,
and enriched with Year, Month, and Order Status columns.
"""
df = pd.read_csv("superstore.csv", encoding="latin1", parse_dates=["Order Date", "Ship Date"])
# Map state names to state codes using us library. Needed for px.choropleth.
df["State Code"] = df["State"].map({state.name: state.abbr for state in us.states.STATES})
# Filter dataframe for only the latest 2 years and add a month column.
df["Year"] = df["Order Date"].dt.year
df["Month"] = df["Order Date"].dt.month
df = df[df["Year"].isin([THIS_YEAR, LAST_YEAR])]
# Create order status - randomly assign with weights: 60% Delivered, 10% In Transit, 30% Processing
df["Order Status"] = random.choices(["Delivered", "In Transit", "Processing"], weights=[0.6, 0.1, 0.3], k=len(df))
return df
def make_superstore_profit_df(df):
"""Aggregate product data by sub-category with profit metrics.
Args:
df: Source dataframe containing Sales and Profit columns.
Returns:
pd.DataFrame: Aggregated dataframe with Sub-Category, Sales, Profit,
Profit Margin, and Profit Absolute columns.
"""
df = df.groupby("Sub-Category", as_index=False).agg({"Sales": "sum", "Profit": "sum"})
df["Profit Margin"] = df["Profit"] / df["Sales"]
df["Profit Absolute"] = df["Profit"].abs()
return df
def make_customer_sales_pareto_df(df):
"""Create Pareto analysis dataframe for customer sales.
Args:
df: Source dataframe containing Customer Name and Sales columns.
Returns:
pd.DataFrame: Customer sales ranked with cumulative totals and percentages.
"""
df = df.groupby("Customer Name", as_index=False)["Sales"].sum().sort_values("Sales", ascending=False)
df["Cumulative Sales"] = df["Sales"].cumsum()
df["Cumulative % of Sales"] = 100 * df["Cumulative Sales"] / df["Sales"].sum()
df["Rank"] = range(1, len(df) + 1)
df["% of Total Customers"] = 100 * df["Rank"] / len(df)
return df[["Rank", "Customer Name", "Sales", "Cumulative Sales", "Cumulative % of Sales", "% of Total Customers"]]