# File size: 1,774 Bytes (revision 72d0706)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os
import sys
# Add parent directory to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from path_utils import RFM_FEATURES_PATH, SCALED_DATA_PATH
def preprocess_data():
    """Clip, log-transform and standardize the RFM table, then persist it.

    Reads the CSV at ``RFM_FEATURES_PATH`` (indexed by ``Customer ID``),
    caps ``Frequency`` and ``Monetary`` at their 95th percentile, applies
    ``np.log1p`` to every column and standardizes the result with a
    ``StandardScaler``. A dict with the keys ``rfm_scaled``, ``scaler``,
    ``rfm_log`` and ``rfm_raw`` is then written to ``SCALED_DATA_PATH``
    with ``joblib.dump``.

    Returns:
        None. When the input file does not exist an error message is
        printed and nothing is written.
    """
    print("Starting data preprocessing...")

    # Guard clause: without the engineered features there is nothing to do.
    if not os.path.exists(RFM_FEATURES_PATH):
        print(f"Error: RFM features not found at {RFM_FEATURES_PATH}. Run 02_rfm_engineering.py first.")
        return

    rfm = pd.read_csv(RFM_FEATURES_PATH, index_col='Customer ID')
    print(f"Loaded RFM features for {len(rfm)} customers.")

    # Step 1: cap the heavy right tail of Frequency and Monetary at the
    # 95th percentile. The frame is modified in place, so every later use
    # of `rfm` (including the saved 'rfm_raw' entry) sees the capped values.
    for capped_col in ('Frequency', 'Monetary'):
        values = rfm[capped_col]
        cap = values.quantile(0.95)
        rfm[capped_col] = np.where(values > cap, cap, values)

    # Step 2: log1p compresses skewed columns and maps 0 to 0.
    # NOTE(review): log1p yields NaN for values below -1; this assumes the
    # upstream step produces no such values -- confirm.
    rfm_log = np.log1p(rfm)
    print("Log transformation completed.")

    # Step 3: standardize each column; fit() returns the scaler itself.
    scaler = StandardScaler().fit(rfm_log)
    scaled_values = scaler.transform(rfm_log)

    # Re-attach the customer index and column labels for easier inspection.
    rfm_scaled_df = pd.DataFrame(
        scaled_values,
        index=rfm.index,
        columns=rfm.columns,
    )

    # Persist the scaled frame together with the fitted scaler and the
    # intermediate frames in a single joblib file.
    joblib.dump(
        {
            'rfm_scaled': rfm_scaled_df,
            'scaler': scaler,
            'rfm_log': rfm_log,
            'rfm_raw': rfm,  # holds the clipped values, see step 1
        },
        SCALED_DATA_PATH,
    )
    print(f"Preprocessed data and scaler saved to {SCALED_DATA_PATH}")
    print(rfm_scaled_df.head())
# Run the preprocessing step only when this file is executed as a script,
# so importing the module has no side effects beyond its definitions.
if __name__ == "__main__":
    preprocess_data()
|