import os
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Add parent directory to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from path_utils import RFM_FEATURES_PATH, SCALED_DATA_PATH


def preprocess_data():
    """Clip, log-transform and standardize the RFM features, then persist them.

    Reads the CSV at ``RFM_FEATURES_PATH`` (indexed by ``'Customer ID'``),
    caps ``Frequency`` and ``Monetary`` at their 95th percentile, applies
    ``log1p`` to every column, standardizes the result with a
    ``StandardScaler`` and dumps a dict to ``SCALED_DATA_PATH`` via joblib.

    The saved dict has these keys:
        'rfm_scaled': DataFrame of standardized values (same index/columns
            as the loaded frame).
        'scaler': the fitted ``StandardScaler``.
        'rfm_log': the log-transformed frame the scaler was fitted on.
        'rfm_raw': the loaded frame *after* percentile clipping.

    Returns:
        None. If the input file does not exist, an error message is printed
        and the function returns without writing anything.
    """
    print("Starting data preprocessing...")

    # Load RFM features
    if not os.path.exists(RFM_FEATURES_PATH):
        print(f"Error: RFM features not found at {RFM_FEATURES_PATH}. Run 02_rfm_engineering.py first.")
        return

    rfm = pd.read_csv(RFM_FEATURES_PATH, index_col='Customer ID')
    print(f"Loaded RFM features for {len(rfm)} customers.")

    # 1. Handle outliers (clipping at the 95th percentile for a better
    #    Silhouette score). Values above the cap are replaced by the cap.
    for col in ['Frequency', 'Monetary']:
        upper_limit = rfm[col].quantile(0.95)
        rfm[col] = np.where(rfm[col] > upper_limit, upper_limit, rfm[col])

    # 2. Log transformation. log1p handles zeros, but it yields NaN for
    #    values below -1 and -inf at exactly -1, which would silently poison
    #    the scaled output. Floor negatives at 0 for the transform only; the
    #    frame stored as 'rfm_raw' is left as-is, and non-negative data takes
    #    exactly the same path as before.
    negative_count = int((rfm < 0).to_numpy().sum())
    if negative_count:
        print(
            f"Warning: {negative_count} negative value(s) found in RFM features; "
            "treating them as 0 for the log transformation."
        )
        log_input = rfm.clip(lower=0)
    else:
        log_input = rfm
    rfm_log = np.log1p(log_input)
    print("Log transformation completed.")

    # 3. Scaling: fit on the log-transformed data and transform it in one step.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_log)

    # Convert to DataFrame for easier inspection
    rfm_scaled_df = pd.DataFrame(rfm_scaled, index=rfm.index, columns=rfm.columns)

    # Save scaled data and scaler together so later steps can reuse the
    # fitted scaler alongside the data it was fitted on.
    processed_data = {
        'rfm_scaled': rfm_scaled_df,
        'scaler': scaler,
        'rfm_log': rfm_log,
        'rfm_raw': rfm,
    }

    joblib.dump(processed_data, SCALED_DATA_PATH)
    print(f"Preprocessed data and scaler saved to {SCALED_DATA_PATH}")
    print(rfm_scaled_df.head())


if __name__ == "__main__":
    preprocess_data()