# segmentx-behavioral-intelligence/pipeline/05_evaluation_visualization.py
# DIVYANSHI SINGH
# Initial commit: SegmentX Behavioral Intelligence Portal
# 72d0706
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import joblib
import os
import sys
# Add parent directory to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from path_utils import SCALED_DATA_PATH, KMEANS_MODEL_PATH, OUTPUTS_DIR, CUSTOMER_SEGMENTS_PATH, CLEANED_DATA_PATH, SEGMENT_PRODUCTS_PATH
def evaluate_and_visualize():
    """Profile the stored clusters and write plots and segment tables to disk.

    Steps, in order:

    1. Load the scaled and raw RFM tables (``SCALED_DATA_PATH``) and the
       model stored at ``KMEANS_MODEL_PATH``.
    2. Project the scaled features onto two principal components and save a
       scatter plot coloured by cluster.
    3. Compute per-cluster mean Recency/Frequency/Monetary, rank the clusters
       by mean Monetary (descending) and assign a segment name by rank.
    4. Save a heatmap of the min-max normalised cluster means.
    5. Write the per-customer table (with ``Cluster`` and ``Segment``
       columns) to ``CUSTOMER_SEGMENTS_PATH``.
    6. If ``CLEANED_DATA_PATH`` exists, write the ten highest-quantity
       products per segment to ``SEGMENT_PRODUCTS_PATH``.

    Prints an error and returns early when the scaled data or the model file
    is missing. Always returns ``None``; results are delivered as files and
    console output.
    """
    print("Starting evaluation and visualization...")

    # Both artefacts are produced by earlier pipeline steps; without them
    # there is nothing to evaluate.
    if not os.path.exists(SCALED_DATA_PATH) or not os.path.exists(KMEANS_MODEL_PATH):
        print("Error: Required files not found. Run previous steps first.")
        return

    data_dict = joblib.load(SCALED_DATA_PATH)
    X = data_dict['rfm_scaled']
    rfm_raw = data_dict['rfm_raw']
    kmeans = joblib.load(KMEANS_MODEL_PATH)

    # labels_ holds the assignments recorded when the model was fitted, so
    # this assumes X / rfm_raw contain the same rows, in the same order, as
    # the training data -- TODO confirm against the clustering step.
    cluster_labels = kmeans.labels_
    rfm_raw['Cluster'] = cluster_labels

    # savefig does not create missing directories, so make sure the output
    # folder exists before the first plot is written.
    if OUTPUTS_DIR:
        os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # 1. PCA for 2D visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
    pca_df['Cluster'] = cluster_labels

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        x='PCA1', y='PCA2', hue='Cluster', data=pca_df,
        palette='viridis', alpha=0.7, ax=ax,
    )
    ax.set_title('Customer Segments PCA Visualization')
    fig.savefig(os.path.join(OUTPUTS_DIR, "cluster_pca_plot.png"))
    plt.close(fig)

    # 2. Mean RFM per cluster, richest cluster first
    cluster_summary = (
        rfm_raw.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
        .mean()
        .sort_values('Monetary', ascending=False)
    )
    print("\nCluster RFM Means:")
    print(cluster_summary)

    # 3. Label clusters by profile.
    # Cluster IDs are arbitrary and can change between fits, so names are
    # assigned by rank in the Monetary-sorted summary rather than by ID.
    # This is a heuristic: Recency and Frequency are not consulted.
    labels = ["Champions", "Loyal Customers", "At-Risk", "Lost/Hibernating"]
    cluster_mapping = {
        cid: labels[rank] if rank < len(labels) else f"Other {rank}"
        for rank, cid in enumerate(cluster_summary.index.tolist())
    }
    rfm_raw['Segment'] = rfm_raw['Cluster'].map(cluster_mapping)

    print("\nCluster Mapping Applied:")
    for cid, label in cluster_mapping.items():
        print(f"Cluster {cid} -> {label}")

    # 4. RFM heatmap on min-max normalised means (0 to 1 per metric).
    # A metric that is identical across clusters (always the case with a
    # single cluster) has zero spread; dividing by 1 instead maps it to 0
    # rather than filling the heatmap with NaN.
    col_min = cluster_summary.min()
    spread = (cluster_summary.max() - col_min).replace(0, 1)
    summary_norm = (cluster_summary - col_min) / spread

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(summary_norm.T, annot=True, cmap='RdYlGn', ax=ax)
    ax.set_title('Relative Behavioral Metrics by Cluster')
    fig.savefig(os.path.join(OUTPUTS_DIR, "rfm_cluster_heatmap.png"))
    plt.close(fig)

    # 5. Save results (the index is written as the first CSV column)
    rfm_raw.to_csv(CUSTOMER_SEGMENTS_PATH)
    print(f"Customer segments saved to {CUSTOMER_SEGMENTS_PATH}")

    # 6. Segment product affinity
    print("Calculating Segment Product Affinity...")
    if not os.path.exists(CLEANED_DATA_PATH):
        print(f"Warning: Cleaned data not found at {CLEANED_DATA_PATH}. Skipping product affinity.")
        return

    df_clean = pd.read_csv(CLEANED_DATA_PATH)

    # Attach each transaction's segment. right_index=True presumes rfm_raw is
    # indexed by customer ID -- verify against the step that builds it.
    # The default inner join drops transactions whose customer has no segment.
    df_merged = df_clean.merge(
        rfm_raw[['Cluster', 'Segment']],
        left_on='Customer ID',
        right_index=True,
    )

    # Total quantity per (segment, product), largest first within a segment,
    # then keep the first ten rows of every segment.
    top_products = (
        df_merged.groupby(['Segment', 'Description'])['Quantity']
        .sum()
        .reset_index()
        .sort_values(['Segment', 'Quantity'], ascending=[True, False])
    )
    top_10_per_segment = top_products.groupby('Segment').head(10)
    top_10_per_segment.to_csv(SEGMENT_PRODUCTS_PATH, index=False)
    print(f"Segment product affinity saved to {SEGMENT_PRODUCTS_PATH}")
# Run the evaluation step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    evaluate_and_visualize()