| import pandas as pd |
| import numpy as np |
| from sklearn.preprocessing import StandardScaler |
| import joblib |
| import os |
| import sys |
|
|
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from path_utils import RFM_FEATURES_PATH, SCALED_DATA_PATH |
|
|
def preprocess_data():
    """Cap, log-transform and standardize the RFM features, then persist them.

    Steps, in order:

    1. Load the RFM table from ``RFM_FEATURES_PATH`` (CSV indexed by the
       ``Customer ID`` column).
    2. Cap ``Frequency`` and ``Monetary`` at their own 95th percentile.
    3. Apply ``log1p`` to every column.
    4. Standardize the logged features with a ``StandardScaler``.
    5. Dump a dict with the keys ``rfm_scaled``, ``scaler``, ``rfm_log`` and
       ``rfm_raw`` to ``SCALED_DATA_PATH`` with joblib.

    Returns:
        None. If the input file does not exist an error message is printed
        and nothing is written.
    """
    print("Starting data preprocessing...")

    # Bail out early (without raising) when the upstream step has not run.
    if not os.path.exists(RFM_FEATURES_PATH):
        print(f"Error: RFM features not found at {RFM_FEATURES_PATH}. Run 02_rfm_engineering.py first.")
        return

    rfm = pd.read_csv(RFM_FEATURES_PATH, index_col='Customer ID')
    print(f"Loaded RFM features for {len(rfm)} customers.")

    # Replace every value above the column's 95th percentile with that
    # percentile. This overwrites the columns of `rfm` in place.
    for col in ['Frequency', 'Monetary']:
        upper_limit = rfm[col].quantile(0.95)
        rfm[col] = np.where(rfm[col] > upper_limit, upper_limit, rfm[col])

    # log1p is applied to the whole frame, so every column must be numeric.
    # NOTE(review): log1p yields NaN below -1 and -inf at exactly -1; this
    # assumes all RFM columns are non-negative -- TODO confirm upstream.
    rfm_log = np.log1p(rfm)
    print("Log transformation completed.")

    # Fit and apply the scaler in one call; the fitted scaler is saved below
    # so the same transformation can be re-applied later.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_log)

    # The scaler returns a bare array; restore the customer index and
    # column labels.
    rfm_scaled_df = pd.DataFrame(rfm_scaled, index=rfm.index, columns=rfm.columns)

    processed_data = {
        'rfm_scaled': rfm_scaled_df,
        'scaler': scaler,
        'rfm_log': rfm_log,
        # NOTE(review): `rfm` was capped in place above, so 'rfm_raw' holds
        # the capped values, not the values as read from the CSV.
        'rfm_raw': rfm,
    }

    # Make sure the destination directory exists so the dump does not fail
    # with FileNotFoundError after all the work above has been done.
    output_dir = os.path.dirname(SCALED_DATA_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(processed_data, SCALED_DATA_PATH)

    print(f"Preprocessed data and scaler saved to {SCALED_DATA_PATH}")
    print(rfm_scaled_df.head())
|
|
# Run the preprocessing step only when this file is executed directly as a
# script; importing the module has no side effect beyond the definitions.
if __name__ == "__main__":
    preprocess_data()
|
|