File size: 4,037 Bytes
db10938
fecd12b
 
 
 
 
db10938
fecd12b
1173a1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fecd12b
 
db10938
 
fecd12b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e770beb
 
 
 
 
 
fecd12b
 
 
 
 
 
 
 
 
 
 
 
 
 
1173a1a
 
 
fecd12b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import io
import zipfile
from typing import Optional

import pandas as pd
import streamlit as st

from src.constants import EXCLUDED_WEBSITES, ORDERS_CSV_ENTRY, REFUNDS_CSV_ENTRY

# Tolerance (USD) for matching a refund row to a specific line item by amount.
# Refund Details.csv has Order ID but no ASIN, so to flag the actual returned
# line in a multi-item order we match on Refund Amount ≈ line Total Amount.
# The comparison is inclusive (|diff| <= tolerance). Set generously enough to
# absorb rounding/tax variation; tight enough to avoid matching cheaper items
# in the same order. Refunds outside this tolerance (partial refunds,
# restocking fees, shipping-only) are left un-flagged.
REFUND_MATCH_TOLERANCE = 0.5


def _match_refunds_to_lines(orders: pd.DataFrame, refunds: pd.DataFrame) -> set:
    refunded_idx: set = set()
    orders_by_id = {oid: g for oid, g in orders.groupby("Order ID")}
    for order_id, group in refunds.groupby("Order ID"):
        if order_id not in orders_by_id:
            continue
        unmatched = {idx: amt for idx, amt in orders_by_id[order_id]["Total Amount"].items()}
        # Greedy largest-first: bigger refunds are less ambiguous, so match them
        # before they get stolen by a near-tie smaller line.
        for refund_amt in sorted(group["Refund Amount"], reverse=True):
            best_idx, best_diff = None, REFUND_MATCH_TOLERANCE
            for idx, amt in unmatched.items():
                d = abs(amt - refund_amt)
                if d <= best_diff:
                    best_diff, best_idx = d, idx
            if best_idx is not None:
                refunded_idx.add(best_idx)
                del unmatched[best_idx]
    return refunded_idx


@st.cache_data
def load_data(zip_bytes: bytes) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Parse the uploaded order-history zip into cleaned (orders, refunds) frames.

    Orders are filtered to real online purchases with a positive charge;
    refunds are deduplicated and stamped with their originating order's date
    so net spend lands in the month of purchase.
    """

    def month_of(dates: pd.Series) -> pd.Series:
        # Calendar-month bucket: naive timestamp at the start of the month.
        return dates.dt.tz_convert(None).dt.to_period("M").dt.to_timestamp()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        with archive.open(ORDERS_CSV_ENTRY) as handle:
            orders = pd.read_csv(handle)
        with archive.open(REFUNDS_CSV_ENTRY) as handle:
            refunds = pd.read_csv(handle)

    # Drop cancelled orders, and exclude in-store / grocery channels (Whole
    # Foods is `panda01`, plus the cashier-less `Amazon Go`) — those are
    # physical-store scans, not Amazon online orders, and Whole Foods produce
    # in particular collapses badly under groupby because most rows share the
    # `_ASINLESS_` sentinel ASIN.
    keep = (orders["Order Status"] != "Cancelled") & ~orders["Website"].isin(
        EXCLUDED_WEBSITES
    )
    orders = orders.loc[keep].copy()

    orders["Order Date"] = pd.to_datetime(orders["Order Date"], utc=True)
    cleaned_amounts = orders["Total Amount"].astype(str).str.replace(",", "", regex=False)
    orders["Total Amount"] = cleaned_amounts.astype(float)
    # Keep only lines with a real positive charge. $0 lines are item-level
    # cancellations inside multi-item orders (Original Quantity = 0 while the
    # order stayed Closed), free replacements for damaged/missing goods,
    # fully-discounted promos, and Prime Try-Before-You-Buy returns — none
    # reflect real spending, and they pollute the "Most expensive products"
    # tail when the date window is narrow.
    orders = orders.loc[orders["Total Amount"] > 0]
    orders["Month"] = month_of(orders["Order Date"])

    # The export emits multiple `Creation Date` rows per actual refund event
    # (8x in observed data); dedup on the natural key before summing or
    # amounts inflate ~2x.
    refunds = refunds.drop_duplicates(subset=["Order ID", "Refund Date", "Refund Amount"]).copy()
    refunds["Refund Amount"] = refunds["Refund Amount"].astype(float)

    # Attribute each refund to its original order's date so net spend is shown
    # in the month the purchase was made (option b), not the month of the refund.
    purchase_dates = orders.drop_duplicates("Order ID").set_index("Order ID")["Order Date"]
    refunds["Order Date"] = refunds["Order ID"].map(purchase_dates)
    refunds = refunds.dropna(subset=["Order Date"])
    refunds["Month"] = month_of(refunds["Order Date"])

    orders["Refunded"] = False
    matched_lines = _match_refunds_to_lines(orders, refunds)
    orders.loc[list(matched_lines), "Refunded"] = True

    return orders, refunds