Spaces:

SezarTheGreat
/

Mall-Customer-Segmentation-Updated

Runtime error

App Files Files Community

Mall-Customer-Segmentation-Updated / supervised_explainability.py

SezarTheGreat

Upload supervised_explainability.py with huggingface_hub

ad206b2 verified 6 days ago

raw

history blame contribute delete

4.17 kB

	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestClassifier
	import logging
	import joblib

	# Configure root logging: INFO and above, each record rendered as
	# "<timestamp> - <logger name> - <level> - <message>".
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	# Module-scoped logger, named after this module.
	logger = logging.getLogger(__name__)

	class SegmentClassifier:
	    """
	    Supervised layer treating the GMM dominant cluster (argmax) as the categorical target.

	    Trains a Random Forest classifier and uses its native feature importance
	    for explainability. All methods build their feature matrix through
	    ``_prepare_features`` so that training, prediction and explanation apply
	    the same column filtering, and inference inputs are aligned to the columns
	    the model was fitted on whenever the model exposes them.
	    """

	    def __init__(self, random_state: int = 42):
	        """Create an unfitted 100-tree random forest seeded with ``random_state``."""
	        self.random_state = random_state
	        self.model = RandomForestClassifier(
	            n_estimators=100,
	            n_jobs=-1,
	            random_state=self.random_state,
	        )

	    @staticmethod
	    def _is_leak_column(name) -> bool:
	        """Return True when a column must be excluded from the feature matrix."""
	        # NOTE(review): the 'id' check is a case-insensitive substring match, so
	        # it also removes names such as "Paid" or "Width". It is kept unchanged
	        # so models already trained with this filter keep their feature set.
	        return 'Prob' in name or 'Cluster' in name or 'id' in name.lower()

	    def _trained_feature_names(self):
	        """Return the column names the model was fitted on, or None if unknown.

	        Relies on the estimator's ``feature_names_in_`` attribute, which
	        scikit-learn sets when fitting on a DataFrame with string column names.
	        Reading it from the model (instead of storing a copy on ``self``) means
	        the information survives ``save_model`` / ``load_model``.
	        """
	        names = getattr(self.model, 'feature_names_in_', None)
	        return None if names is None else list(names)

	    def _prepare_features(self, X: pd.DataFrame, align: bool = True) -> pd.DataFrame:
	        """Build the numeric feature matrix used by the forest.

	        Keeps numeric columns only, drops leak columns (see ``_is_leak_column``)
	        and replaces missing values with 0.

	        Args:
	            X: Raw/enriched customer features.
	            align: When True and the fitted column names are known, select
	                exactly those columns in training order (extra columns are
	                discarded). Pass False when (re)training.

	        Returns:
	            A new DataFrame; ``X`` itself is not modified.

	        Raises:
	            ValueError: If ``align`` is True and ``X`` lacks a trained column.
	        """
	        numeric = X.select_dtypes(include=['number'])
	        leak_cols = [c for c in numeric.columns if self._is_leak_column(c)]
	        features = numeric.drop(columns=leak_cols).fillna(0)
	        if not align:
	            return features

	        expected = self._trained_feature_names()
	        if expected is None:
	            # Unfitted model or one fitted without column names: nothing to align to.
	            return features

	        missing = [c for c in expected if c not in features.columns]
	        if missing:
	            raise ValueError(
	                f"Input is missing features the model was trained on: {missing}"
	            )
	        return features[expected]

	    def train_and_explain(self, X: pd.DataFrame, y: pd.Series, top_k: int = 5) -> list[str]:
	        """
	        Trains the Random Forest classifier on the raw/enriched features X against cluster target y.
	        Calculates feature importances and returns the top_k global features.

	        Args:
	            X: Feature frame; non-numeric and leak columns are ignored.
	            y: Target labels, one per row of ``X``.
	            top_k: Number of most important feature names to return.

	        Returns:
	            Feature names ordered by decreasing importance (at most ``top_k``).

	        Raises:
	            ValueError: If no usable feature column remains after filtering.
	        """
	        logger.info(
	            "Training Supervised Random Forest Classifier on %s samples with %s features.",
	            X.shape[0],
	            X.shape[1],
	        )

	        # No alignment here: training defines the feature set.
	        features = self._prepare_features(X, align=False)
	        if features.shape[1] == 0:
	            raise ValueError("No numeric, non-leak feature columns available for training.")

	        self.model.fit(features, y)
	        logger.info("Random Forest training completed.")

	        logger.info("Extracting feature importances for explainability...")

	        importance_df = pd.DataFrame({
	            'feature': features.columns,
	            'importance': self.model.feature_importances_,
	        }).sort_values(by='importance', ascending=False)

	        top_features = importance_df.head(top_k)['feature'].tolist()

	        logger.info("Top %s Features driving segment assignment globally: %s", top_k, top_features)
	        return top_features

	    def predict(self, X: pd.DataFrame) -> np.ndarray:
	        """Return the model's predicted segment for every row of ``X``.

	        Raises:
	            ValueError: If ``X`` lacks a column the model was trained on.
	        """
	        return self.model.predict(self._prepare_features(X))

	    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
	        """Return the model's per-class probabilities for every row of ``X``.

	        Raises:
	            ValueError: If ``X`` lacks a column the model was trained on.
	        """
	        return self.model.predict_proba(self._prepare_features(X))

	    def explain_local(self, X_sample: pd.DataFrame, predicted_class: int) -> pd.DataFrame:
	        """
	        Calculates local feature importance for a single customer sample using global importance as proxy.

	        The values come from the model's global ``feature_importances_``; the
	        sample's values and ``predicted_class`` are not used in the computation
	        (``predicted_class`` is kept for interface compatibility).

	        Returns:
	            DataFrame with columns ``feature`` and ``importance_value`` sorted
	            by decreasing absolute importance, or an empty frame with those
	            columns if the explanation could not be produced.
	        """
	        try:
	            global_importance = self.model.feature_importances_

	            # Label importances with the columns the model was fitted on, so a
	            # reordered or wider sample frame cannot mislabel them. Fall back
	            # to the sample's own columns only when the model has no names.
	            feature_names = self._trained_feature_names()
	            if feature_names is None:
	                feature_names = list(self._prepare_features(X_sample, align=False).columns)

	            importance_df = pd.DataFrame({
	                'feature': feature_names,
	                'importance_value': global_importance,
	            })

	            importance_df['abs_importance'] = importance_df['importance_value'].abs()
	            importance_df = (
	                importance_df
	                .sort_values(by='abs_importance', ascending=False)
	                .drop(columns=['abs_importance'])
	            )
	            return importance_df

	        except Exception as e:
	            # Deliberate best-effort boundary: explanations are optional, so a
	            # failure is logged and reported as an empty result.
	            logger.error("Failed to generate random forest explanations: %s", e)
	            return pd.DataFrame(columns=['feature', 'importance_value'])

	    def save_model(self, filepath: str):
	        """Persist the underlying forest (not this wrapper) to ``filepath`` via joblib."""
	        joblib.dump(self.model, filepath)

	    @classmethod
	    def load_model(cls, filepath: str):
	        """Return a new instance whose model is loaded from ``filepath``.

	        The instance is built with the default ``random_state``; only the
	        ``model`` attribute is replaced by the loaded object.
	        """
	        instance = cls()
	        # SECURITY: joblib.load unpickles the file and can execute arbitrary
	        # code; only load model files from a trusted source.
	        instance.model = joblib.load(filepath)
	        return instance