Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import joblib | |
| import os | |
| import numpy as np | |
| class ClusterPredictor: | |
| def __init__(self, model_path: str, preprocessor_path: str): | |
| self.model_path = model_path | |
| self.preprocessor_path = preprocessor_path | |
| self.model = None | |
| self.preprocessor = None | |
| self.FEATURES = [ | |
| 'tx_count', 'active_days', 'avg_tx_per_day', 'total_gas_spent', | |
| 'total_nft_buys', 'total_nft_sells', 'total_nft_volume_usd', | |
| 'unique_nfts_owned', 'dex_trades', 'avg_trade_size_usd', | |
| 'total_traded_usd', 'erc20_receive_usd', 'erc20_send_usd', | |
| 'native_balance_delta' | |
| ] | |
| self.PERSONA_MAPPING = { | |
| 0: "High-Frequency Bots / Automated Traders", | |
| 1: "High-Value NFT & Crypto Traders (Degen Whales)", | |
| 2: "Active Retail Users / Everyday Traders", | |
| 3: "Ultra-Whales / Institutional & Exchange Wallets" | |
| } | |
| self._load_artifacts() | |
| def _load_artifacts(self): | |
| """Loads the model and preprocessor from disk.""" | |
| if not os.path.exists(self.model_path): | |
| raise FileNotFoundError(f"Model file not found at {self.model_path}") | |
| if not os.path.exists(self.preprocessor_path): | |
| raise FileNotFoundError(f"Preprocessor file not found at {self.preprocessor_path}") | |
| print(f"Loading model from {self.model_path}...") | |
| self.model = joblib.load(self.model_path) | |
| print(f"Loading preprocessor from {self.preprocessor_path}...") | |
| self.preprocessor = joblib.load(self.preprocessor_path) | |
| def predict(self, data: dict | pd.DataFrame) -> dict: | |
| """ | |
| Predicts the persona for the given wallet data and provides probability scores. | |
| Args: | |
| data: A dictionary or DataFrame containing the required features. | |
| Returns: | |
| A dictionary (or list of dicts) containing: | |
| - cluster_label: The predicted cluster ID. | |
| - persona: The human-readable persona name. | |
| - probabilities: A dictionary mapping each persona to its confidence score (0-1). | |
| """ | |
| import numpy as np | |
| from scipy.special import softmax | |
| if isinstance(data, dict): | |
| df = pd.DataFrame([data]) | |
| elif isinstance(data, pd.DataFrame): | |
| df = data.copy() | |
| else: | |
| raise ValueError("Input data must be a dictionary or pandas DataFrame.") | |
| missing_cols = set(self.FEATURES) - set(df.columns) | |
| if missing_cols: | |
| raise ValueError(f"Missing required features: {missing_cols}") | |
| X = df[self.FEATURES] | |
| X_transformed = self.preprocessor.transform(X) | |
| # 1. Hard Prediction (Cluster Label) | |
| cluster_labels = self.model.predict(X_transformed) | |
| # 2. Soft Probability (Distance-based) | |
| # transform() returns distance to each cluster center | |
| distances = self.model.transform(X_transformed) | |
| # We want closer distance = higher probability. | |
| # So we take the negative distance. | |
| # We apply softmax to normalize into a probability distribution (sum=1). | |
| # Multiplying by a factor (e.g., -1 or -2) can sharpen the probabilities. | |
| # Using -1 * distance is standard for "soft k-means". | |
| probs = softmax(-distances, axis=1) | |
| results = [] | |
| for i, label in enumerate(cluster_labels): | |
| prob_dict = { | |
| self.PERSONA_MAPPING.get(c_idx, f"Cluster {c_idx}"): float(probs[i][c_idx]) | |
| for c_idx in range(probs.shape[1]) | |
| } | |
| results.append({ | |
| "cluster_label": int(label), | |
| "persona": self.PERSONA_MAPPING.get(label, "Unknown"), | |
| "probabilities": prob_dict | |
| }) | |
| if len(results) == 1: | |
| return results[0] | |
| return results | |