|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
|
|
|
def make_sku_week_panel(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
'''
|
|
|
Process raw UCI Online Retail data into SKU-week panel format.
|
|
|
|
|
|
:param df: Input raw data
|
|
|
:type df: pd.DataFrame
|
|
|
:return: Processed SKU-week panel data
|
|
|
:rtype: pd.DataFrame
|
|
|
'''
|
|
|
|
|
|
df_panel = df.copy()
|
|
|
|
|
|
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
|
|
|
df['StockCode'] = df['StockCode'].astype(str)
|
|
|
|
|
|
|
|
|
df = df[~df['InvoiceNo'].str.startswith('C', na=False)]
|
|
|
df = df[df['Quantity'] > 0]
|
|
|
df = df[df['UnitPrice'] > 0]
|
|
|
df = df.dropna(subset=['InvoiceDate', 'StockCode', 'Quantity', 'UnitPrice'])
|
|
|
|
|
|
|
|
|
if 'Country' in df.columns:
|
|
|
df = df[df['Country'] == 'United Kingdom']
|
|
|
|
|
|
|
|
|
df['Week'] = df['InvoiceDate'].dt.to_period('W').apply(lambda r: r.start_time)
|
|
|
|
|
|
|
|
|
df_panel = (df.groupby(['StockCode', 'Week']).apply(
|
|
|
lambda x: pd.Series({
|
|
|
'qty': x['Quantity'].sum(),
|
|
|
'price': x['UnitPrice'].mean(),
|
|
|
'avg_price': np.average(x['UnitPrice'], weights=x['Quantity']),
|
|
|
'n_txn': len(x),
|
|
|
})
|
|
|
).reset_index())
|
|
|
|
|
|
return df_panel
|
|
|
|
|
|
|
|
|
def eligible_skus(df_panel: pd.DataFrame, min_weeks: int = 26, min_price_points: int = 10, min_total_qty: int = 200) -> list[str]:
|
|
|
'''
|
|
|
Identify SKUs eligible for analysis based on data sufficiency criteria.
|
|
|
|
|
|
:param df_panel: Processed SKU-week panel data
|
|
|
:type df_panel: pd.DataFrame
|
|
|
:param min_weeks: Minimum number of weeks of data required
|
|
|
:type min_weeks: int
|
|
|
:param min_price_points: Minimum number of distinct price points required
|
|
|
:type min_price_points: int
|
|
|
:param min_total_qty: Minimum total quantity required
|
|
|
:type min_total_qty: int
|
|
|
:return: List of eligible SKU codes
|
|
|
:rtype: list[str]
|
|
|
'''
|
|
|
sku_stats = df_panel.groupby('StockCode').agg(
|
|
|
n_weeks=('Week', 'nunique'),
|
|
|
n_price_points=('avg_price', 'nunique'),
|
|
|
total_qty=('qty', 'sum'),
|
|
|
)
|
|
|
eligible = sku_stats[
|
|
|
(sku_stats['n_weeks'] >= min_weeks) &
|
|
|
(sku_stats['total_qty'] >= min_total_qty) &
|
|
|
(sku_stats['n_price_points'] >= min_price_points)
|
|
|
].index.tolist()
|
|
|
return eligible
|
|
|
|