PranavSharma's picture
initial commit
5d5de4c verified
import pandas as pd
import numpy as np
def make_sku_week_panel(df: pd.DataFrame) -> pd.DataFrame:
'''
Process raw UCI Online Retail data into SKU-week panel format.
:param df: Input raw data
:type df: pd.DataFrame
:return: Processed SKU-week panel data
:rtype: pd.DataFrame
'''
# "Normalise"
df_panel = df.copy()
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
df['StockCode'] = df['StockCode'].astype(str)
# Hard governance filters
df = df[~df['InvoiceNo'].str.startswith('C', na=False)] # Remove cancellations
df = df[df['Quantity'] > 0] # Keep only positive quantities
df = df[df['UnitPrice'] > 0] # Keep only positive prices
df = df.dropna(subset=['InvoiceDate', 'StockCode', 'Quantity', 'UnitPrice']) # Remove rows with missing values
# Select one country for simplicity
if 'Country' in df.columns:
df = df[df['Country'] == 'United Kingdom']
# Create 'Week' column
df['Week'] = df['InvoiceDate'].dt.to_period('W').apply(lambda r: r.start_time)
# Aggregate to SKU-Week level
df_panel = (df.groupby(['StockCode', 'Week']).apply(
lambda x: pd.Series({
'qty': x['Quantity'].sum(),
'price': x['UnitPrice'].mean(),
'avg_price': np.average(x['UnitPrice'], weights=x['Quantity']),
'n_txn': len(x),
})
).reset_index())
return df_panel
def eligible_skus(df_panel: pd.DataFrame, min_weeks: int = 26, min_price_points: int = 10, min_total_qty: int = 200) -> list[str]:
'''
Identify SKUs eligible for analysis based on data sufficiency criteria.
:param df_panel: Processed SKU-week panel data
:type df_panel: pd.DataFrame
:param min_weeks: Minimum number of weeks of data required
:type min_weeks: int
:param min_price_points: Minimum number of distinct price points required
:type min_price_points: int
:param min_total_qty: Minimum total quantity required
:type min_total_qty: int
:return: List of eligible SKU codes
:rtype: list[str]
'''
sku_stats = df_panel.groupby('StockCode').agg(
n_weeks=('Week', 'nunique'),
n_price_points=('avg_price', 'nunique'),
total_qty=('qty', 'sum'),
)
eligible = sku_stats[
(sku_stats['n_weeks'] >= min_weeks) &
(sku_stats['total_qty'] >= min_total_qty) &
(sku_stats['n_price_points'] >= min_price_points)
].index.tolist()
return eligible