| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from sklearn.decomposition import PCA |
| import joblib |
| import os |
| import sys |
|
|
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from path_utils import SCALED_DATA_PATH, KMEANS_MODEL_PATH, OUTPUTS_DIR, CUSTOMER_SEGMENTS_PATH, CLEANED_DATA_PATH, SEGMENT_PRODUCTS_PATH |
|
|
def _label_clusters_by_rank(ranked_cluster_ids):
    """Map cluster ids (already ordered best-first) to segment names.

    The first four ids receive the fixed names below, in order; any further
    ids are named ``"Other <position>"``.
    """
    labels = ["Champions", "Loyal Customers", "At-Risk", "Lost/Hibernating"]
    return {
        cid: labels[i] if i < len(labels) else f"Other {i}"
        for i, cid in enumerate(ranked_cluster_ids)
    }


def _min_max_scale_columns(summary):
    """Rescale every column of *summary* to the [0, 1] range.

    A column whose value is identical for all rows has a zero range; dividing
    by it would yield 0/0 = NaN, so such a range is replaced by 1 and the
    column maps to 0 instead.
    """
    col_min = summary.min()
    col_range = (summary.max() - col_min).replace(0, 1)
    return (summary - col_min) / col_range


def _save_pca_scatter(X, cluster_labels, out_path):
    """Project *X* onto two principal components and save a scatter plot.

    Points are coloured by *cluster_labels*; the image is written to
    *out_path*.
    """
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
    pca_df['Cluster'] = cluster_labels

    fig = plt.figure(figsize=(10, 8))
    try:
        sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df, palette='viridis', alpha=0.7)
        plt.title('Customer Segments PCA Visualization')
        plt.savefig(out_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


def _save_rfm_heatmap(cluster_summary, out_path):
    """Save a heatmap of the per-cluster means, min-max scaled per metric."""
    summary_norm = _min_max_scale_columns(cluster_summary)

    fig = plt.figure(figsize=(10, 6))
    try:
        # Transposed so metrics are rows and clusters are columns.
        sns.heatmap(summary_norm.T, annot=True, cmap='RdYlGn')
        plt.title('Relative Behavioral Metrics by Cluster')
        plt.savefig(out_path)
    finally:
        plt.close(fig)


def _export_segment_top_products(rfm_raw):
    """Write the ten highest-quantity products of each segment to CSV.

    Reads the cleaned transactions from ``CLEANED_DATA_PATH`` and writes the
    result to ``SEGMENT_PRODUCTS_PATH``.
    """
    df_clean = pd.read_csv(CLEANED_DATA_PATH)

    # Inner join (pandas default): only transactions whose 'Customer ID'
    # matches an index value of rfm_raw are kept.
    # NOTE(review): presumably rfm_raw is indexed by customer id with a dtype
    # compatible with the CSV column -- confirm against the earlier steps.
    df_merged = df_clean.merge(rfm_raw[['Cluster', 'Segment']], left_on='Customer ID', right_index=True)

    top_products = df_merged.groupby(['Segment', 'Description'])['Quantity'].sum().reset_index()
    top_products = top_products.sort_values(['Segment', 'Quantity'], ascending=[True, False])

    # Rows are already sorted by quantity within each segment, so head(10)
    # keeps each segment's ten largest totals.
    top_10_per_segment = top_products.groupby('Segment').head(10)
    top_10_per_segment.to_csv(SEGMENT_PRODUCTS_PATH, index=False)
    print(f"Segment product affinity saved to {SEGMENT_PRODUCTS_PATH}")


def evaluate_and_visualize():
    """Evaluate the fitted KMeans clustering and export plots and CSV reports.

    Steps, in order:

    1. Load the scaled/raw RFM data and the fitted model with joblib; print
       an error and return early when either file is missing.
    2. Attach ``kmeans.labels_`` to the raw RFM frame as ``'Cluster'``.
    3. Save a 2-component PCA scatter plot to ``OUTPUTS_DIR``.
    4. Print the per-cluster means of Recency/Frequency/Monetary, sorted by
       mean Monetary (descending), and name the clusters in that order.
    5. Save a heatmap of the min-max scaled cluster means to ``OUTPUTS_DIR``.
    6. Write the labelled customers to ``CUSTOMER_SEGMENTS_PATH``.
    7. If ``CLEANED_DATA_PATH`` exists, write each segment's top ten products
       to ``SEGMENT_PRODUCTS_PATH``; otherwise print a warning.

    Returns:
        None.

    Raises:
        ValueError: if the number of model labels does not match the number
            of rows in the loaded data.
    """
    print("Starting evaluation and visualization...")

    if not os.path.exists(SCALED_DATA_PATH) or not os.path.exists(KMEANS_MODEL_PATH):
        print("Error: Required files not found. Run previous steps first.")
        return

    # joblib.load unpickles arbitrary objects: only load artifacts produced
    # by this pipeline, never files from an untrusted source.
    data_dict = joblib.load(SCALED_DATA_PATH)
    X = data_dict['rfm_scaled']
    rfm_raw = data_dict['rfm_raw']
    kmeans = joblib.load(KMEANS_MODEL_PATH)

    cluster_labels = kmeans.labels_
    # Fail early with a clear message instead of an opaque pandas length
    # error further down when the model and the data are out of sync.
    if not (len(cluster_labels) == len(rfm_raw) == len(X)):
        raise ValueError(
            f"Model has {len(cluster_labels)} labels but the data has "
            f"{len(rfm_raw)} raw rows and {len(X)} scaled rows; "
            "re-run the previous steps so the model and data match."
        )
    rfm_raw['Cluster'] = cluster_labels

    # savefig does not create missing directories.
    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    _save_pca_scatter(X, cluster_labels, os.path.join(OUTPUTS_DIR, "cluster_pca_plot.png"))

    cluster_summary = rfm_raw.groupby('Cluster').agg({
        'Recency': 'mean',
        'Frequency': 'mean',
        'Monetary': 'mean',
    }).sort_values('Monetary', ascending=False)

    print("\nCluster RFM Means:")
    print(cluster_summary)

    # Clusters are named purely by their rank on mean Monetary value.
    cluster_mapping = _label_clusters_by_rank(cluster_summary.index.tolist())

    rfm_raw['Segment'] = rfm_raw['Cluster'].map(cluster_mapping)
    print("\nCluster Mapping Applied:")
    for cid, label in cluster_mapping.items():
        print(f"Cluster {cid} -> {label}")

    _save_rfm_heatmap(cluster_summary, os.path.join(OUTPUTS_DIR, "rfm_cluster_heatmap.png"))

    rfm_raw.to_csv(CUSTOMER_SEGMENTS_PATH)
    print(f"Customer segments saved to {CUSTOMER_SEGMENTS_PATH}")

    print("Calculating Segment Product Affinity...")
    if os.path.exists(CLEANED_DATA_PATH):
        _export_segment_top_products(rfm_raw)
    else:
        print(f"Warning: Cleaned data not found at {CLEANED_DATA_PATH}. Skipping product affinity.")
|
|
if __name__ == "__main__":
    # Executed as a script (not imported): run the evaluation step.
    evaluate_and_visualize()
|
|