Spaces:
Sleeping
Sleeping
| """KMeans customer segmentation model.""" | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.cluster import KMeans | |
| from sklearn.metrics import silhouette_score | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler | |
# Human-readable segment label for each integer cluster id (0-4).
# Ids without an entry fall back to a generic "Segment <id>" label in
# CustomerSegmentationModel.get_segment_name.
# NOTE(review): KMeans numbers its clusters arbitrarily, so this fixed
# id -> name pairing presumably relies on a relabeling/ordering step that is
# not visible in this module -- confirm before trusting the names.
SEGMENT_NAMES: dict[int, str] = dict(
    enumerate(
        (
            "High Value",
            "Loyal",
            "Occasional",
            "At Risk",
            "Lost",
        )
    )
)
# Columns fed to the clustering pipeline; the order here is the column order
# of the feature matrix produced by build_feature_matrix.
FEATURE_COLS: list[str] = [
    # Taken from the RFM frame (build_feature_matrix slices the first three).
    "recency_days",
    "frequency",
    "monetary",
] + [
    # Taken from the CLV frame.
    "avg_order_value",
    "tenure_days",
    "avg_days_between_orders",
]
class CustomerSegmentationModel:
    """Cluster customers on standardized RFM + CLV features with KMeans.

    The estimator is a two-step scikit-learn ``Pipeline`` (``StandardScaler``
    followed by ``KMeans``) that is created when :meth:`fit` is called.
    """

    def __init__(self, n_clusters: int = 5, random_state: int = 42) -> None:
        """Store hyper-parameters; no estimator exists until :meth:`fit` runs.

        Args:
            n_clusters: Number of clusters passed to ``KMeans``.
            random_state: Seed passed to ``KMeans`` and reused for the
                silhouette subsample in :meth:`evaluate`.
        """
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.pipeline: Pipeline | None = None

    def _fitted_pipeline(self) -> "Pipeline":
        """Return the fitted pipeline or raise if :meth:`fit` was never called."""
        if self.pipeline is None:
            raise RuntimeError("Model not fitted. Call fit() first.")
        return self.pipeline

    def build_feature_matrix(
        self, rfm_df: pd.DataFrame, clv_df: pd.DataFrame
    ) -> tuple[pd.DataFrame, np.ndarray]:
        """Merge RFM + CLV features and return (index_df, feature_matrix).

        Args:
            rfm_df: Frame holding ``customer_unique_id`` plus the first three
                ``FEATURE_COLS`` (recency, frequency, monetary).
            clv_df: Frame holding ``customer_unique_id`` plus
                ``avg_order_value``, ``tenure_days`` and
                ``avg_days_between_orders``.

        Returns:
            A pair ``(index_df, X)``. ``index_df`` is a one-column frame of
            ``customer_unique_id`` and ``X`` holds the ``FEATURE_COLS`` values
            for the same rows in the same order. Only customers present in
            both inputs, and with no missing feature value, are kept.

        Raises:
            KeyError: If either frame lacks one of the selected columns.
        """
        merged = rfm_df[["customer_unique_id"] + FEATURE_COLS[:3]].merge(
            clv_df[["customer_unique_id", "avg_order_value", "tenure_days", "avg_days_between_orders"]],
            on="customer_unique_id",
            how="inner",
        )
        # KMeans cannot handle NaN, so incomplete rows are dropped up front.
        merged = merged.dropna(subset=FEATURE_COLS)
        X = merged[FEATURE_COLS].to_numpy()
        # Return an independent frame so callers can attach label columns
        # without triggering pandas' SettingWithCopyWarning.
        return merged[["customer_unique_id"]].copy(), X

    def fit(self, X: np.ndarray) -> "CustomerSegmentationModel":
        """Build a fresh scaler + KMeans pipeline and fit it on ``X``.

        Any previously fitted pipeline is replaced.

        Args:
            X: Feature matrix, one row per customer.

        Returns:
            ``self``, to allow call chaining.
        """
        self.pipeline = Pipeline(
            [
                ("scaler", StandardScaler()),
                ("kmeans", KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)),
            ]
        )
        self.pipeline.fit(X)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the cluster id assigned to each row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        return self._fitted_pipeline().predict(X)

    def evaluate(self, X: np.ndarray) -> dict:
        """Compute clustering quality metrics for ``X``.

        Args:
            X: Feature matrix with the same columns used for fitting.

        Returns:
            Dict with ``silhouette_score`` (computed in scaled feature space
            on a subsample of at most 10000 rows), ``inertia`` (the
            ``inertia_`` attribute of the fitted KMeans step, i.e. the value
            recorded at fit time, not recomputed for ``X``) and
            ``n_clusters``.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: Propagated from scikit-learn, e.g. when fewer than
                two distinct labels are predicted for ``X``.
        """
        pipeline = self._fitted_pipeline()
        kmeans = pipeline.named_steps["kmeans"]

        # Scale once and reuse the result for both labelling and scoring;
        # this is what Pipeline.predict does internally.
        X_scaled = pipeline.named_steps["scaler"].transform(X)
        labels = kmeans.predict(X_scaled)

        # Seed the subsample so repeated evaluations of the same model and
        # data report the same score instead of depending on the global RNG.
        score = silhouette_score(
            X_scaled,
            labels,
            sample_size=10000,
            random_state=self.random_state,
        )
        return {
            "silhouette_score": float(score),
            "inertia": float(kmeans.inertia_),
            "n_clusters": self.n_clusters,
        }

    def get_segment_name(self, cluster_id: int) -> str:
        """Return the display name for ``cluster_id``.

        Ids missing from ``SEGMENT_NAMES`` get the generic label
        ``"Segment <cluster_id>"``.
        """
        return SEGMENT_NAMES.get(cluster_id, f"Segment {cluster_id}")