Spaces:
Running
Running
| """Data processing functions for the Superstore BI dashboard.""" | |
| import random | |
| import pandas as pd | |
| import us | |
| THIS_YEAR = 2017 | |
| LAST_YEAR = 2016 | |
| COLUMN_TO_METRIC = {"Sales": "Sales", "Profit": "Profit", "Order ID": "Orders", "Customer ID": "Customers"} | |
| COLUMN_TO_AGGFUNC = {"Sales": "sum", "Profit": "sum", "Order ID": "nunique", "Customer ID": "nunique"} | |
| def make_superstore_df(): | |
| """Load and preprocess the Superstore dataset. | |
| Returns: | |
| pd.DataFrame: Processed dataframe with state codes, filtered to latest 2 years, | |
| and enriched with Year, Month, and Order Status columns. | |
| """ | |
| df = pd.read_csv("superstore.csv", encoding="latin1", parse_dates=["Order Date", "Ship Date"]) | |
| # Map state names to state codes using us library. Needed for px.choropleth. | |
| df["State Code"] = df["State"].map({state.name: state.abbr for state in us.states.STATES}) | |
| # Filter dataframe for only the latest 2 years and add a month column. | |
| df["Year"] = df["Order Date"].dt.year | |
| df["Month"] = df["Order Date"].dt.month | |
| df = df[df["Year"].isin([THIS_YEAR, LAST_YEAR])] | |
| # Create order status - randomly assign with weights: 60% Delivered, 10% In Transit, 30% Processing | |
| df["Order Status"] = random.choices(["Delivered", "In Transit", "Processing"], weights=[0.6, 0.1, 0.3], k=len(df)) | |
| return df | |
| def make_superstore_profit_df(df): | |
| """Aggregate product data by sub-category with profit metrics. | |
| Args: | |
| df: Source dataframe containing Sales and Profit columns. | |
| Returns: | |
| pd.DataFrame: Aggregated dataframe with Sub-Category, Sales, Profit, | |
| Profit Margin, and Profit Absolute columns. | |
| """ | |
| df = df.groupby("Sub-Category", as_index=False).agg({"Sales": "sum", "Profit": "sum"}) | |
| df["Profit Margin"] = df["Profit"] / df["Sales"] | |
| df["Profit Absolute"] = df["Profit"].abs() | |
| return df | |
| def make_customer_sales_pareto_df(df): | |
| """Create Pareto analysis dataframe for customer sales. | |
| Args: | |
| df: Source dataframe containing Customer Name and Sales columns. | |
| Returns: | |
| pd.DataFrame: Customer sales ranked with cumulative totals and percentages. | |
| """ | |
| df = df.groupby("Customer Name", as_index=False)["Sales"].sum().sort_values("Sales", ascending=False) | |
| df["Cumulative Sales"] = df["Sales"].cumsum() | |
| df["Cumulative % of Sales"] = 100 * df["Cumulative Sales"] / df["Sales"].sum() | |
| df["Rank"] = range(1, len(df) + 1) | |
| df["% of Total Customers"] = 100 * df["Rank"] / len(df) | |
| return df[["Rank", "Customer Name", "Sales", "Cumulative Sales", "Cumulative % of Sales", "% of Total Customers"]] | |