import pandas as pd import numpy as np def make_sku_week_panel(df: pd.DataFrame) -> pd.DataFrame: ''' Process raw UCI Online Retail data into SKU-week panel format. :param df: Input raw data :type df: pd.DataFrame :return: Processed SKU-week panel data :rtype: pd.DataFrame ''' # "Normalise" df_panel = df.copy() df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce') df['StockCode'] = df['StockCode'].astype(str) # Hard governance filters df = df[~df['InvoiceNo'].str.startswith('C', na=False)] # Remove cancellations df = df[df['Quantity'] > 0] # Keep only positive quantities df = df[df['UnitPrice'] > 0] # Keep only positive prices df = df.dropna(subset=['InvoiceDate', 'StockCode', 'Quantity', 'UnitPrice']) # Remove rows with missing values # Select one country for simplicity if 'Country' in df.columns: df = df[df['Country'] == 'United Kingdom'] # Create 'Week' column df['Week'] = df['InvoiceDate'].dt.to_period('W').apply(lambda r: r.start_time) # Aggregate to SKU-Week level df_panel = (df.groupby(['StockCode', 'Week']).apply( lambda x: pd.Series({ 'qty': x['Quantity'].sum(), 'price': x['UnitPrice'].mean(), 'avg_price': np.average(x['UnitPrice'], weights=x['Quantity']), 'n_txn': len(x), }) ).reset_index()) return df_panel def eligible_skus(df_panel: pd.DataFrame, min_weeks: int = 26, min_price_points: int = 10, min_total_qty: int = 200) -> list[str]: ''' Identify SKUs eligible for analysis based on data sufficiency criteria. :param df_panel: Processed SKU-week panel data :type df_panel: pd.DataFrame :param min_weeks: Minimum number of weeks of data required :type min_weeks: int :param min_price_points: Minimum number of distinct price points required :type min_price_points: int :param min_total_qty: Minimum total quantity required :type min_total_qty: int :return: List of eligible SKU codes :rtype: list[str] ''' sku_stats = df_panel.groupby('StockCode').agg( n_weeks=('Week', 'nunique'), n_price_points=('avg_price', 'nunique'), total_qty=('qty', 'sum'), ) eligible = sku_stats[ (sku_stats['n_weeks'] >= min_weeks) & (sku_stats['total_qty'] >= min_total_qty) & (sku_stats['n_price_points'] >= min_price_points) ].index.tolist() return eligible