| from __future__ import annotations |
|
|
| import numpy as np |
| import pandas as pd |
| from typing import Dict |
| from src.core.config_loader import Config |
|
|
|
|
# Sampling weight for each user category. Insertion order matters: the
# labels and weights are read out of this mapping in parallel, and the six
# weights add up to 1.0 as required for a probability vector.
USER_TYPE_PROBS: Dict[str, float] = dict(
    customer=0.6,
    merchant=0.15,
    supplier=0.05,
    employer=0.1,
    fraudster=0.05,
    mule=0.05,
)


# KYC labels and their sampling weights, paired by position (sum to 1.0).
KYC_LEVELS = [
    "low",
    "medium",
    "full",
]
KYC_PROBS = [
    0.2,
    0.3,
    0.5,
]


# Risk-profile labels and their sampling weights, paired by position
# (sum to 1.0).
RISK_LEVELS = [
    "low",
    "medium",
    "high",
]
RISK_PROBS = [
    0.6,
    0.3,
    0.1,
]
|
|
|
|
def _sample_user_types(n: int) -> np.ndarray:
    """Draw ``n`` user-type labels, weighted by ``USER_TYPE_PROBS``.

    Labels are the keys of ``USER_TYPE_PROBS`` and the weights are the
    matching values. Sampling goes through ``np.random.choice``, i.e. it
    uses NumPy's global random state.

    Args:
        n: Number of labels to draw.

    Returns:
        Array of ``n`` labels.
    """
    labels = list(USER_TYPE_PROBS)
    weights = [USER_TYPE_PROBS[label] for label in labels]
    drawn = np.random.choice(labels, size=n, p=weights)
    return drawn
|
|
|
|
def _sample_kyc(n: int) -> np.ndarray:
    """Draw ``n`` KYC labels from ``KYC_LEVELS`` weighted by ``KYC_PROBS``.

    Sampling goes through ``np.random.choice``, i.e. it uses NumPy's global
    random state.

    Args:
        n: Number of labels to draw.

    Returns:
        Array of ``n`` labels.
    """
    drawn = np.random.choice(KYC_LEVELS, p=KYC_PROBS, size=n)
    return drawn
|
|
|
|
def _sample_risk(n: int) -> np.ndarray:
    """Draw ``n`` risk labels from ``RISK_LEVELS`` weighted by ``RISK_PROBS``.

    Sampling goes through ``np.random.choice``, i.e. it uses NumPy's global
    random state.

    Args:
        n: Number of labels to draw.

    Returns:
        Array of ``n`` labels.
    """
    drawn = np.random.choice(RISK_LEVELS, p=RISK_PROBS, size=n)
    return drawn
|
|
|
|
def generate_users(config: Config) -> pd.DataFrame:
    """Build a table of randomly generated users, one row per user.

    Reads ``config.num_users`` for the row count and ``config.user_params``
    for the attributes ``lambda_mean``, ``lambda_std``, ``mu_mean``,
    ``mu_std``, ``sigma_mean`` and ``sigma_std``. All draws use NumPy's
    global random state, in this order: ``lambda_u``, ``mu_u``, ``sigma_u``,
    ``balance``, ``user_type``, ``kyc_level``, ``risk_profile``.

    Args:
        config: Object exposing ``num_users`` and ``user_params``.

    Returns:
        DataFrame with columns ``user_id``, ``user_type``, ``lambda_u``,
        ``mu_u``, ``sigma_u``, ``balance``, ``kyc_level`` and
        ``risk_profile``.

    Raises:
        ValueError: If the frame contains nulls, or if any ``lambda_u`` or
            ``sigma_u`` value is not strictly positive.
    """
    count = config.num_users
    params = config.user_params

    # Sequential ids 0 .. count-1.
    ids = np.arange(count)

    # NOTE: the order of the random draws below is part of the behavior,
    # because they all consume the same global random stream.

    # Log-normal draw whose underlying normal has mean log(lambda_mean).
    lam = np.random.lognormal(
        np.log(params.lambda_mean),
        params.lambda_std,
        count,
    )

    # Plain normal draw.
    mu = np.random.normal(params.mu_mean, params.mu_std, count)

    # Uniform draw on [sigma_mean - sigma_std, sigma_mean + sigma_std],
    # with the lower bound kept at or above 1e-6.
    sig_low = max(1e-6, params.sigma_mean - params.sigma_std)
    sig_high = params.sigma_mean + params.sigma_std
    sig = np.random.uniform(sig_low, sig_high, count)

    # Floor both at 1e-6 so they stay strictly positive.
    lam = lam.clip(min=1e-6)
    sig = sig.clip(min=1e-6)

    # Balances use fixed log-normal parameters (mean=10.0, sigma=1.0).
    balances = np.random.lognormal(10.0, 1.0, count)

    # Categorical attributes, sampled independently of each other.
    types = _sample_user_types(count)
    kyc = _sample_kyc(count)
    risk = _sample_risk(count)

    columns = {
        "user_id": ids,
        "user_type": types,
        "lambda_u": lam,
        "mu_u": mu,
        "sigma_u": sig,
        "balance": balances,
        "kyc_level": kyc,
        "risk_profile": risk,
    }
    users = pd.DataFrame(columns)

    # Sanity checks on the assembled frame.
    has_nulls = users.isnull().any().any()
    if has_nulls:
        raise ValueError("NaNs detected in generated users")

    positivity_checks = (
        ("lambda_u", "Invalid lambda_u values"),
        ("sigma_u", "Invalid sigma_u values"),
    )
    for column, message in positivity_checks:
        if (users[column] <= 0).any():
            raise ValueError(message)

    return users