File size: 1,774 Bytes
72d0706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os
import sys

# Add parent directory to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from path_utils import RFM_FEATURES_PATH, SCALED_DATA_PATH

def preprocess_data():
    """Cap, log-transform and standardise the RFM table, then persist it.

    Reads the CSV at ``RFM_FEATURES_PATH`` using ``'Customer ID'`` as the
    index. If that file does not exist, an error message is printed and the
    function returns ``None`` without writing anything.

    Otherwise the steps are:

    1. ``Frequency`` and ``Monetary`` are each capped at their own 95th
       percentile (values above the cap are replaced by the cap).
    2. ``np.log1p`` is applied to every column of the frame.
    3. A ``StandardScaler`` is fitted on the logged frame and used to
       transform it; the result is wrapped in a DataFrame that reuses the
       original index and column labels.

    The outcome is dumped with ``joblib`` to ``SCALED_DATA_PATH`` as a dict
    with the keys ``'rfm_scaled'``, ``'scaler'``, ``'rfm_log'`` and
    ``'rfm_raw'``, and the head of the scaled frame is printed.
    """
    print("Starting data preprocessing...")

    # Guard clause: nothing to do without the upstream feature file.
    if not os.path.exists(RFM_FEATURES_PATH):
        print(f"Error: RFM features not found at {RFM_FEATURES_PATH}. Run 02_rfm_engineering.py first.")
        return

    rfm = pd.read_csv(RFM_FEATURES_PATH, index_col='Customer ID')
    print(f"Loaded RFM features for {len(rfm)} customers.")

    # Step 1 - cap the upper tail of the two columns at the 95th percentile
    # (chosen by the original author to improve the silhouette score).
    # The cap is written back into `rfm` itself, so every later use of `rfm`
    # sees the capped values.
    for capped_col in ('Frequency', 'Monetary'):
        values = rfm[capped_col]
        cap = values.quantile(0.95)
        rfm[capped_col] = np.where(values > cap, cap, values)

    # Step 2 - log1p rather than log so that zeros map to zero instead of -inf.
    rfm_log = np.log1p(rfm)
    print("Log transformation completed.")

    # Step 3 - standardise; the fitted scaler is kept so it can be saved.
    scaler = StandardScaler()
    rfm_scaled_df = pd.DataFrame(
        scaler.fit_transform(rfm_log),
        index=rfm.index,
        columns=rfm.columns,
    )

    # Persist every intermediate stage together with the fitted scaler.
    # NOTE(review): 'rfm_raw' is the frame *after* percentile capping, not the
    # values as read from disk - confirm downstream consumers expect that.
    joblib.dump(
        {
            'rfm_scaled': rfm_scaled_df,
            'scaler': scaler,
            'rfm_log': rfm_log,
            'rfm_raw': rfm,
        },
        SCALED_DATA_PATH,
    )

    print(f"Preprocessed data and scaler saved to {SCALED_DATA_PATH}")
    print(rfm_scaled_df.head())

# Script entry point: run the preprocessing step only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    preprocess_data()