Spaces:

Divya499
/

segmentx-behavioral-intelligence

Sleeping

segmentx-behavioral-intelligence / pipeline /05_evaluation_visualization.py

DIVYANSHI SINGH

Initial commit: SegmentX Behavioral Intelligence Portal

72d0706 2 months ago

4.36 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.decomposition import PCA
	import joblib
	import os
	import sys

	# Add parent directory to sys.path to import path_utils
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from path_utils import SCALED_DATA_PATH, KMEANS_MODEL_PATH, OUTPUTS_DIR, CUSTOMER_SEGMENTS_PATH, CLEANED_DATA_PATH, SEGMENT_PRODUCTS_PATH

	def evaluate_and_visualize() -> None:
	    """Evaluate the fitted K-Means model and write plots and segment tables.

	    Steps performed:
	      1. Load the dict stored at ``SCALED_DATA_PATH`` (keys ``'rfm_scaled'``
	         and ``'rfm_raw'``) and the model stored at ``KMEANS_MODEL_PATH``.
	      2. Project the scaled features onto two PCA components and save a
	         scatter plot as ``cluster_pca_plot.png`` in ``OUTPUTS_DIR``.
	      3. Compute the mean Recency/Frequency/Monetary per cluster, rank the
	         clusters by mean Monetary (descending) and name them in that order.
	      4. Save a heatmap of the min-max normalized cluster means as
	         ``rfm_cluster_heatmap.png`` in ``OUTPUTS_DIR``.
	      5. Write ``rfm_raw`` (with the added ``Cluster`` and ``Segment``
	         columns) to ``CUSTOMER_SEGMENTS_PATH``.
	      6. If ``CLEANED_DATA_PATH`` exists, write the ten products with the
	         highest summed ``Quantity`` per segment to ``SEGMENT_PRODUCTS_PATH``.

	    Returns:
	        None. If either input file is missing, an error is printed and the
	        function returns without producing any output.

	    Raises:
	        ValueError: If the number of labels stored on the model differs from
	            the number of rows in the scaled or raw RFM data.
	    """
	    print("Starting evaluation and visualization...")

	    # Load data
	    if not os.path.exists(SCALED_DATA_PATH) or not os.path.exists(KMEANS_MODEL_PATH):
	        print("Error: Required files not found. Run previous steps first.")
	        return

	    # NOTE(security): joblib.load unpickles its input and can execute
	    # arbitrary code. These paths are presumably written by earlier pipeline
	    # steps; never point them at files from an untrusted source.
	    data_dict = joblib.load(SCALED_DATA_PATH)
	    X = data_dict['rfm_scaled']
	    rfm_raw = data_dict['rfm_raw']
	    kmeans = joblib.load(KMEANS_MODEL_PATH)

	    # Assign labels. `labels_` holds the assignments from the model's own
	    # training run, so they only line up with this data if both artifacts
	    # come from the same run. Fail early with a clear message instead of an
	    # opaque pandas length-mismatch error further down.
	    cluster_labels = kmeans.labels_
	    if len(cluster_labels) != len(rfm_raw) or len(cluster_labels) != len(X):
	        raise ValueError(
	            f"K-Means model has {len(cluster_labels)} labels but the data has "
	            f"{len(X)} scaled rows and {len(rfm_raw)} raw rows; the model and "
	            "the scaled data are out of sync. Re-run the previous steps."
	        )
	    rfm_raw['Cluster'] = cluster_labels

	    # savefig does not create missing directories, so make sure the output
	    # directory exists before the first plot is written.
	    os.makedirs(OUTPUTS_DIR, exist_ok=True)

	    # 1. PCA for 2D Visualization
	    pca = PCA(n_components=2)
	    X_pca = pca.fit_transform(X)
	    pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
	    pca_df['Cluster'] = cluster_labels

	    plt.figure(figsize=(10, 8))
	    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df, palette='viridis', alpha=0.7)
	    plt.title('Customer Segments PCA Visualization')
	    plt.savefig(os.path.join(OUTPUTS_DIR, "cluster_pca_plot.png"))
	    plt.close()

	    # 2. Compute mean RFM per cluster to assign business labels
	    cluster_summary = (
	        rfm_raw.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
	        .mean()
	        .sort_values('Monetary', ascending=False)
	    )

	    print("\nCluster RFM Means:")
	    print(cluster_summary)

	    # 3. Label Clusters Based on Profile
	    # Cluster IDs are arbitrary and can change between training runs, so the
	    # names are assigned by rank instead of by ID. This is a heuristic that
	    # ranks on mean Monetary only (Recency and Frequency are not considered):
	    #   rank 0 -> Champions, rank 1 -> Loyal Customers,
	    #   rank 2 -> At-Risk,   rank 3 -> Lost/Hibernating,
	    #   any further cluster  -> "Other <rank>".
	    labels = ["Champions", "Loyal Customers", "At-Risk", "Lost/Hibernating"]
	    cluster_mapping = {
	        cid: labels[rank] if rank < len(labels) else f"Other {rank}"
	        for rank, cid in enumerate(cluster_summary.index.tolist())
	    }

	    rfm_raw['Segment'] = rfm_raw['Cluster'].map(cluster_mapping)
	    print("\nCluster Mapping Applied:")
	    for cid, label in cluster_mapping.items():
	        print(f"Cluster {cid} -> {label}")

	    # 4. RFM Heatmap
	    # Min-max scale each metric's cluster means to [0, 1] so the three
	    # metrics share one colour scale. A metric whose mean is identical in
	    # every cluster (e.g. a single cluster) has zero range; dividing by 1
	    # instead keeps it at 0.0 rather than turning the column into NaN.
	    summary_min = cluster_summary.min()
	    value_range = (cluster_summary.max() - summary_min).replace(0, 1)
	    summary_norm = (cluster_summary - summary_min) / value_range
	    plt.figure(figsize=(10, 6))
	    sns.heatmap(summary_norm.T, annot=True, cmap='RdYlGn')
	    plt.title('Relative Behavioral Metrics by Cluster')
	    plt.savefig(os.path.join(OUTPUTS_DIR, "rfm_cluster_heatmap.png"))
	    plt.close()

	    # 5. Save results
	    rfm_raw.to_csv(CUSTOMER_SEGMENTS_PATH)
	    print(f"Customer segments saved to {CUSTOMER_SEGMENTS_PATH}")

	    # 6. Segment Product Affinity (top products by total quantity per segment)
	    print("Calculating Segment Product Affinity...")
	    if not os.path.exists(CLEANED_DATA_PATH):
	        print(f"Warning: Cleaned data not found at {CLEANED_DATA_PATH}. Skipping product affinity.")
	        return

	    df_clean = pd.read_csv(CLEANED_DATA_PATH)
	    # Attach segment labels by matching 'Customer ID' against the index of
	    # rfm_raw (presumably the customer ID -- confirm against the step that
	    # builds it). merge defaults to an inner join, so transactions whose
	    # customer has no segment are dropped.
	    df_merged = df_clean.merge(rfm_raw[['Cluster', 'Segment']], left_on='Customer ID', right_index=True)

	    # Total quantity per (segment, product), largest first within a segment.
	    top_products = df_merged.groupby(['Segment', 'Description'])['Quantity'].sum().reset_index()
	    top_products = top_products.sort_values(['Segment', 'Quantity'], ascending=[True, False])

	    # The rows are already sorted, so head(10) per group is the top 10.
	    top_10_per_segment = top_products.groupby('Segment').head(10)
	    top_10_per_segment.to_csv(SEGMENT_PRODUCTS_PATH, index=False)
	    print(f"Segment product affinity saved to {SEGMENT_PRODUCTS_PATH}")

	# Script entry point: run the evaluation/visualization step only when this
	# file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    evaluate_and_visualize()