File size: 2,575 Bytes
5d5de4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd 
import numpy as np 

def make_sku_week_panel(df: pd.DataFrame) -> pd.DataFrame:
    '''

    Process raw UCI Online Retail data into SKU-week panel format.

    

    :param df: Input raw data

    :type df: pd.DataFrame  

    :return: Processed SKU-week panel data

    :rtype: pd.DataFrame

    '''
    # "Normalise"
    df_panel = df.copy()

    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
    df['StockCode'] = df['StockCode'].astype(str)

    # Hard governance filters
    df = df[~df['InvoiceNo'].str.startswith('C', na=False)]  # Remove cancellations
    df = df[df['Quantity'] > 0]  # Keep only positive quantities
    df = df[df['UnitPrice'] > 0]  # Keep only positive prices
    df = df.dropna(subset=['InvoiceDate', 'StockCode', 'Quantity', 'UnitPrice'])  # Remove rows with missing values

    # Select one country for simplicity
    if 'Country' in df.columns:
        df = df[df['Country'] == 'United Kingdom']
    
    # Create 'Week' column
    df['Week'] = df['InvoiceDate'].dt.to_period('W').apply(lambda r: r.start_time)

    # Aggregate to SKU-Week level
    df_panel = (df.groupby(['StockCode', 'Week']).apply(
        lambda x: pd.Series({
            'qty': x['Quantity'].sum(),
            'price': x['UnitPrice'].mean(),
            'avg_price': np.average(x['UnitPrice'], weights=x['Quantity']),
            'n_txn': len(x),
        })
    ).reset_index())

    return df_panel


def eligible_skus(df_panel: pd.DataFrame, min_weeks: int = 26, min_price_points: int = 10, min_total_qty: int = 200) -> list[str]:
    '''

    Identify SKUs eligible for analysis based on data sufficiency criteria.



    :param df_panel: Processed SKU-week panel data  

    :type df_panel: pd.DataFrame

    :param min_weeks: Minimum number of weeks of data required

    :type min_weeks: int

    :param min_price_points: Minimum number of distinct price points required

    :type min_price_points: int

    :param min_total_qty: Minimum total quantity required

    :type min_total_qty: int

    :return: List of eligible SKU codes

    :rtype: list[str]

    '''
    sku_stats = df_panel.groupby('StockCode').agg(
        n_weeks=('Week', 'nunique'),
        n_price_points=('avg_price', 'nunique'),
        total_qty=('qty', 'sum'),
    )
    eligible = sku_stats[
        (sku_stats['n_weeks'] >= min_weeks) &
        (sku_stats['total_qty'] >= min_total_qty) &
        (sku_stats['n_price_points'] >= min_price_points)
    ].index.tolist()
    return eligible