"""Evaluate the fitted K-Means model and export segment plots and tables."""

import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

# Add parent directory to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from path_utils import (
    SCALED_DATA_PATH,
    KMEANS_MODEL_PATH,
    OUTPUTS_DIR,
    CUSTOMER_SEGMENTS_PATH,
    CLEANED_DATA_PATH,
    SEGMENT_PRODUCTS_PATH,
)


def evaluate_and_visualize():
    """Label customers with their cluster/segment and write plots and CSVs.

    Steps performed:

    1. Load the scaled/raw RFM data and the fitted K-Means model.
    2. Project the scaled features to 2D with PCA and save a scatter plot
       (``cluster_pca_plot.png`` under ``OUTPUTS_DIR``).
    3. Compute per-cluster means of Recency/Frequency/Monetary and assign a
       business label to each cluster by descending mean Monetary.
    4. Save a heatmap of the min-max normalized cluster means
       (``rfm_cluster_heatmap.png`` under ``OUTPUTS_DIR``).
    5. Write the labelled RFM table to ``CUSTOMER_SEGMENTS_PATH``.
    6. If the cleaned transactions file exists, write the top 10 products
       per segment (by summed Quantity) to ``SEGMENT_PRODUCTS_PATH``.

    Returns:
        None. If the scaled data or the model file is missing, an error is
        printed and the function returns without producing any output.
    """
    print("Starting evaluation and visualization...")

    # Load data
    if not os.path.exists(SCALED_DATA_PATH) or not os.path.exists(KMEANS_MODEL_PATH):
        print("Error: Required files not found. Run previous steps first.")
        return

    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts produced by this pipeline, never untrusted files.
    data_dict = joblib.load(SCALED_DATA_PATH)
    X = data_dict['rfm_scaled']
    rfm_raw = data_dict['rfm_raw']
    kmeans = joblib.load(KMEANS_MODEL_PATH)

    # The plots below are written into OUTPUTS_DIR; make sure it exists so
    # savefig does not fail on a fresh checkout.
    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Assign labels
    # NOTE(review): uses the labels stored on the fitted model, which assumes
    # the model was fitted on the same rows (same order) as rfm_raw -- confirm.
    cluster_labels = kmeans.labels_
    rfm_raw['Cluster'] = cluster_labels

    # 1. PCA for 2D Visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
    pca_df['Cluster'] = cluster_labels

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df, palette='viridis', alpha=0.7)
    plt.title('Customer Segments PCA Visualization')
    plt.savefig(os.path.join(OUTPUTS_DIR, "cluster_pca_plot.png"))
    plt.close()

    # 2. Compute mean RFM per cluster to assign business labels
    cluster_summary = rfm_raw.groupby('Cluster').agg({
        'Recency': 'mean',
        'Frequency': 'mean',
        'Monetary': 'mean'
    }).sort_values('Monetary', ascending=False)

    print("\nCluster RFM Means:")
    print(cluster_summary)

    # 3. Label Clusters Based on Profile
    # Cluster IDs are arbitrary and can change between fits, so labels are
    # assigned by rank in the Monetary-sorted summary rather than by ID.
    # This is a heuristic (Monetary only; Recency/Frequency are not used):
    #   Top 1: Champions
    #   Top 2: Loyal
    #   Top 3: At-Risk
    #   Top 4: Lost
    # Any cluster beyond the fourth gets a generic "Other <rank>" label.
    labels = ["Champions", "Loyal Customers", "At-Risk", "Lost/Hibernating"]
    sorted_ids = cluster_summary.index.tolist()
    cluster_mapping = {
        cid: labels[i] if i < len(labels) else f"Other {i}"
        for i, cid in enumerate(sorted_ids)
    }

    rfm_raw['Segment'] = rfm_raw['Cluster'].map(cluster_mapping)

    print("\nCluster Mapping Applied:")
    for cid, label in cluster_mapping.items():
        print(f"Cluster {cid} -> {label}")

    # 4. RFM Heatmap
    # Normalize values for better heatmap visualization (0 to 1 scaling of the means).
    summary_min = cluster_summary.min()
    value_range = cluster_summary.max() - summary_min
    # A metric with identical means across clusters (or a single cluster) has
    # zero range; divide by 1 instead so the column is 0 rather than NaN.
    value_range = value_range.replace(0, 1)
    summary_norm = (cluster_summary - summary_min) / value_range

    plt.figure(figsize=(10, 6))
    sns.heatmap(summary_norm.T, annot=True, cmap='RdYlGn')
    plt.title('Relative Behavioral Metrics by Cluster')
    plt.savefig(os.path.join(OUTPUTS_DIR, "rfm_cluster_heatmap.png"))
    plt.close()

    # 5. Save results
    rfm_raw.to_csv(CUSTOMER_SEGMENTS_PATH)
    print(f"Customer segments saved to {CUSTOMER_SEGMENTS_PATH}")

    # 6. Segment Product Affinity (Market Basket Analysis)
    print("Calculating Segment Product Affinity...")
    if os.path.exists(CLEANED_DATA_PATH):
        df_clean = pd.read_csv(CLEANED_DATA_PATH)

        # Merge with segment labels.
        # NOTE(review): joins 'Customer ID' against rfm_raw's index, so this
        # presumably expects rfm_raw to be indexed by customer ID -- confirm.
        df_merged = df_clean.merge(
            rfm_raw[['Cluster', 'Segment']],
            left_on='Customer ID',
            right_index=True,
        )

        # Calculate Top 10 Products per Segment (by Total Quantity)
        top_products = (
            df_merged.groupby(['Segment', 'Description'])['Quantity']
            .sum()
            .reset_index()
        )
        top_products = top_products.sort_values(
            ['Segment', 'Quantity'], ascending=[True, False]
        )

        # Rows are already sorted by Quantity within each segment, so head(10)
        # keeps the ten best-selling products of every segment.
        top_10_per_segment = top_products.groupby('Segment').head(10)

        top_10_per_segment.to_csv(SEGMENT_PRODUCTS_PATH, index=False)
        print(f"Segment product affinity saved to {SEGMENT_PRODUCTS_PATH}")
    else:
        print(f"Warning: Cleaned data not found at {CLEANED_DATA_PATH}. Skipping product affinity.")


if __name__ == "__main__":
    evaluate_and_visualize()