Spaces:

Divya499
/

segmentx-behavioral-intelligence

Sleeping

segmentx-behavioral-intelligence / pipeline /03_preprocessing.py

DIVYANSHI SINGH

Initial commit: SegmentX Behavioral Intelligence Portal

72d0706 2 months ago

1.77 kB

	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler
	import joblib
	import os
	import sys

	# Add parent directory to sys.path to import path_utils
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from path_utils import RFM_FEATURES_PATH, SCALED_DATA_PATH

	def preprocess_data():
	    """Clip, log-transform and standardise the RFM features, then persist them.

	    Reads the RFM table from ``RFM_FEATURES_PATH`` (indexed by ``Customer ID``),
	    caps ``Frequency`` and ``Monetary`` at their 95th percentile, applies
	    ``np.log1p`` to every column, standardises the result with
	    ``StandardScaler`` and dumps a dict to ``SCALED_DATA_PATH`` with joblib.

	    The saved dict contains:
	        ``rfm_scaled``: standardised features as a DataFrame.
	        ``scaler``: the fitted ``StandardScaler``.
	        ``rfm_log``: log-transformed (clipped) features.
	        ``rfm_raw``: the features exactly as loaded, before clipping.

	    Returns:
	        None. When the input file is missing an error is printed and nothing
	        is written.
	    """
	    print("Starting data preprocessing...")

	    # Load RFM features
	    if not os.path.exists(RFM_FEATURES_PATH):
	        print(f"Error: RFM features not found at {RFM_FEATURES_PATH}. Run 02_rfm_engineering.py first.")
	        return

	    rfm = pd.read_csv(RFM_FEATURES_PATH, index_col='Customer ID')
	    print(f"Loaded RFM features for {len(rfm)} customers.")

	    # 1. Handle Outliers (Clipping at 95th percentile for better Silhouette score)
	    # Clip a copy: mutating `rfm` in place would make the 'rfm_raw' entry
	    # saved below silently contain clipped values instead of the loaded ones.
	    rfm_clipped = rfm.copy()
	    for col in ['Frequency', 'Monetary']:
	        upper_limit = rfm_clipped[col].quantile(0.95)
	        rfm_clipped[col] = rfm_clipped[col].clip(upper=upper_limit)

	    # 2. Log Transformation (np.log1p handles 0s)
	    # Recency, Frequency, and Monetary can be skewed.
	    # log1p is -inf at -1 and NaN below it; surface that instead of letting
	    # invalid values flow silently into the scaler.
	    if (rfm_clipped <= -1).any().any():
	        print("Warning: values <= -1 found; np.log1p will produce NaN/-inf for them.")
	    rfm_log = np.log1p(rfm_clipped)
	    print("Log transformation completed.")

	    # 3. Scaling
	    scaler = StandardScaler()
	    rfm_scaled = scaler.fit_transform(rfm_log)

	    # Convert to DataFrame for easier inspection
	    rfm_scaled_df = pd.DataFrame(rfm_scaled, index=rfm_log.index, columns=rfm_log.columns)

	    # Save scaled data and scaler
	    processed_data = {
	        'rfm_scaled': rfm_scaled_df,
	        'scaler': scaler,
	        'rfm_log': rfm_log,
	        'rfm_raw': rfm,
	    }
	    # Make sure the target directory exists so the dump cannot fail on a
	    # fresh checkout.
	    output_dir = os.path.dirname(SCALED_DATA_PATH)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)
	    joblib.dump(processed_data, SCALED_DATA_PATH)

	    print(f"Preprocessed data and scaler saved to {SCALED_DATA_PATH}")
	    print(rfm_scaled_df.head())

	# Script entry point: run the preprocessing step only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    preprocess_data()