Spaces:
Running
Running
| import io | |
| import zipfile | |
| import pandas as pd | |
| import streamlit as st | |
| from src.constants import EXCLUDED_WEBSITES, ORDERS_CSV_ENTRY, REFUNDS_CSV_ENTRY | |
| # Tolerance (USD) for matching a refund row to a specific line item by amount. | |
| # Refund Details.csv has Order ID but no ASIN, so to flag the actual returned | |
| # line in a multi-item order we match on Refund Amount ≈ line Total Amount. | |
| # Set generously enough to absorb rounding/tax variation; tight enough to avoid | |
| # matching cheaper items in the same order. Refunds outside this tolerance | |
| # (partial refunds, restocking fees, shipping-only) are left un-flagged. | |
| REFUND_MATCH_TOLERANCE = 0.5 | |
| def _match_refunds_to_lines(orders: pd.DataFrame, refunds: pd.DataFrame) -> set: | |
| refunded_idx: set = set() | |
| orders_by_id = {oid: g for oid, g in orders.groupby("Order ID")} | |
| for order_id, group in refunds.groupby("Order ID"): | |
| if order_id not in orders_by_id: | |
| continue | |
| unmatched = {idx: amt for idx, amt in orders_by_id[order_id]["Total Amount"].items()} | |
| # Greedy largest-first: bigger refunds are less ambiguous, so match them | |
| # before they get stolen by a near-tie smaller line. | |
| for refund_amt in sorted(group["Refund Amount"], reverse=True): | |
| best_idx, best_diff = None, REFUND_MATCH_TOLERANCE | |
| for idx, amt in unmatched.items(): | |
| d = abs(amt - refund_amt) | |
| if d <= best_diff: | |
| best_diff, best_idx = d, idx | |
| if best_idx is not None: | |
| refunded_idx.add(best_idx) | |
| del unmatched[best_idx] | |
| return refunded_idx | |
def load_data(zip_bytes: bytes) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load and clean the orders and refunds CSVs from an export zip.

    Args:
        zip_bytes: Raw bytes of the Amazon data-export zip archive containing
            the entries named by ``ORDERS_CSV_ENTRY`` and ``REFUNDS_CSV_ENTRY``.

    Returns:
        ``(orders, refunds)`` DataFrames. ``orders`` gains parsed
        ``Order Date``, numeric ``Total Amount``, a month-start ``Month``
        column, and a boolean ``Refunded`` flag; ``refunds`` gains the
        originating order's ``Order Date`` and a matching ``Month`` column.

    Raises:
        zipfile.BadZipFile: if ``zip_bytes`` is not a valid zip archive.
        KeyError: if either expected CSV entry is missing from the archive.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        with z.open(ORDERS_CSV_ENTRY) as f:
            orders = pd.read_csv(f)
        with z.open(REFUNDS_CSV_ENTRY) as f:
            refunds = pd.read_csv(f)

    orders = orders[orders["Order Status"] != "Cancelled"]
    # Exclude in-store / grocery channels (Whole Foods is `panda01`, plus the
    # cashier-less `Amazon Go`) — they're physical-store scans, not Amazon
    # online orders, and Whole Foods produce in particular collapses badly
    # under groupby because most rows share the `_ASINLESS_` sentinel ASIN.
    orders = orders[~orders["Website"].isin(EXCLUDED_WEBSITES)].copy()
    orders["Order Date"] = pd.to_datetime(orders["Order Date"], utc=True)
    # Amounts arrive as strings with thousands separators (e.g. "1,234.56").
    orders["Total Amount"] = (
        orders["Total Amount"].astype(str).str.replace(",", "", regex=False).astype(float)
    )
    # Drop $0 lines: item-level cancellations within a multi-item order
    # (Original Quantity = 0, order itself stayed Closed), free replacements
    # for damaged/missing goods, fully-discounted promos, and Prime
    # Try-Before-You-Buy returns. None reflect real spending and they pollute
    # the "Most expensive products" tail when the date window is narrow.
    # `.copy()` so the column assignments below write into an owned frame
    # rather than a view of the filtered slice (avoids SettingWithCopyWarning).
    orders = orders[orders["Total Amount"] > 0].copy()
    orders["Month"] = orders["Order Date"].dt.tz_convert(None).dt.to_period("M").dt.to_timestamp()

    # The export emits multiple `Creation Date` rows per actual refund event (8x in
    # observed data); dedup on the natural key before summing or amounts inflate ~2x.
    refunds = refunds.drop_duplicates(subset=["Order ID", "Refund Date", "Refund Amount"]).copy()
    # Attribute each refund to its original order's date so net spend is shown
    # in the month the purchase was made (option b), not the month of the refund.
    order_date = orders.drop_duplicates("Order ID").set_index("Order ID")["Order Date"]
    refunds["Refund Amount"] = refunds["Refund Amount"].astype(float)
    refunds["Order Date"] = refunds["Order ID"].map(order_date)
    # Refunds whose order was filtered out above (cancelled, in-store, $0)
    # have no date to attribute to; drop them.
    refunds = refunds.dropna(subset=["Order Date"])
    refunds["Month"] = refunds["Order Date"].dt.tz_convert(None).dt.to_period("M").dt.to_timestamp()

    orders["Refunded"] = False
    orders.loc[list(_match_refunds_to_lines(orders, refunds)), "Refunded"] = True
    return orders, refunds