Spaces:

vizro
/

BI-dashboard

Running

App Files Files Community

BI-dashboard / data_processing.py

antonymilne

Refactor and simplify Pareto chart with improved data processing

8ba00b6 about 2 months ago

raw

history blame

2.63 kB

	"""Data processing functions for the Superstore BI dashboard."""

	import random

	import pandas as pd
	import us

	THIS_YEAR = 2017
	LAST_YEAR = 2016
	COLUMN_TO_METRIC = {"Sales": "Sales", "Profit": "Profit", "Order ID": "Orders", "Customer ID": "Customers"}
	COLUMN_TO_AGGFUNC = {"Sales": "sum", "Profit": "sum", "Order ID": "nunique", "Customer ID": "nunique"}


	def make_superstore_df():
	"""Load and preprocess the Superstore dataset.

	Returns:
	pd.DataFrame: Processed dataframe with state codes, filtered to latest 2 years,
	and enriched with Year, Month, and Order Status columns.
	"""
	df = pd.read_csv("superstore.csv", encoding="latin1", parse_dates=["Order Date", "Ship Date"])

	# Map state names to state codes using us library. Needed for px.choropleth.
	df["State Code"] = df["State"].map({state.name: state.abbr for state in us.states.STATES})

	# Filter dataframe for only the latest 2 years and add a month column.
	df["Year"] = df["Order Date"].dt.year
	df["Month"] = df["Order Date"].dt.month
	df = df[df["Year"].isin([THIS_YEAR, LAST_YEAR])]

	# Create order status - randomly assign with weights: 60% Delivered, 10% In Transit, 30% Processing
	df["Order Status"] = random.choices(["Delivered", "In Transit", "Processing"], weights=[0.6, 0.1, 0.3], k=len(df))
	return df


	def make_superstore_profit_df(df):
	"""Aggregate product data by sub-category with profit metrics.

	Args:
	df: Source dataframe containing Sales and Profit columns.

	Returns:
	pd.DataFrame: Aggregated dataframe with Sub-Category, Sales, Profit,
	Profit Margin, and Profit Absolute columns.
	"""
	df = df.groupby("Sub-Category", as_index=False).agg({"Sales": "sum", "Profit": "sum"})
	df["Profit Margin"] = df["Profit"] / df["Sales"]
	df["Profit Absolute"] = df["Profit"].abs()
	return df


	def make_customer_sales_pareto_df(df):
	"""Create Pareto analysis dataframe for customer sales.

	Args:
	df: Source dataframe containing Customer Name and Sales columns.

	Returns:
	pd.DataFrame: Customer sales ranked with cumulative totals and percentages.
	"""
	df = df.groupby("Customer Name", as_index=False)["Sales"].sum().sort_values("Sales", ascending=False)
	df["Cumulative Sales"] = df["Sales"].cumsum()
	df["Cumulative % of Sales"] = 100 * df["Cumulative Sales"] / df["Sales"].sum()
	df["Rank"] = range(1, len(df) + 1)
	df["% of Total Customers"] = 100 * df["Rank"] / len(df)
	return df[["Rank", "Customer Name", "Sales", "Cumulative Sales", "Cumulative % of Sales", "% of Total Customers"]]