Spaces:

soupstick
/

aml-intelligence-app

Sleeping

File size: 16,574 Bytes

cc1ad5a

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ROOT_DIR is the directory one level above this file's own folder. It is put
# at the front of sys.path (once) before the project import below --
# presumably so that the `agents` package resolves when this file is executed
# directly as a script; confirm against the repository layout.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from agents.common import DATA_DIR, SEED, ensure_runtime_dirs


# Merchant categories for synthetic transactions. Order matters: the weight
# lists passed to _weighted_choice in _generate_transaction_row are positional.
TRANSACTION_CATEGORIES = ["electronics", "grocery", "wire_transfer", "restaurant", "travel"]
# Loan purposes for synthetic credit applications. Order matters: the weight
# lists in _generate_credit_row are positional.
LOAN_PURPOSES = ["mortgage", "personal", "auto", "consolidation", "business"]
# Countries assigned to sanctions-list entries; sampled uniformly (no weights)
# in _build_sanctions_dataset.
COUNTRIES = [
    "United States",
    "United Kingdom",
    "Canada",
    "United Arab Emirates",
    "Singapore",
    "Nigeria",
    "India",
    "Brazil",
    "Germany",
    "France",
]
# Listing types for sanctions-list entries. Order matters: the weights in
# _build_sanctions_dataset are positional.
RISK_TYPES = ["SANCTIONS", "PEP", "WATCHLIST"]


@dataclass(frozen=True)
class DatasetSummary:
    """Immutable summary of one generated dataset, printed by ``main``."""

    # Label printed for the dataset (e.g. "transaction_fraud_train").
    name: str
    # Number of rows in the frame.
    rows: int
    # Mean of the frame's binary label column, i.e. the fraction of positives.
    positive_rate: float


def _clip(value: float, lower: float, upper: float) -> float:
    """Constrain ``value`` to the closed interval ``[lower, upper]``.

    The result is converted to a built-in ``float`` so callers never receive
    a NumPy scalar.
    """
    bounded = np.clip(value, lower, upper)
    return float(bounded)


def _clip_int(value: float, lower: int, upper: int) -> int:
    """Round ``value`` with the built-in ``round`` and clamp it to ``[lower, upper]``.

    Rounding happens before clamping. The result is a built-in ``int``.
    """
    nearest = round(value)
    bounded = np.clip(nearest, lower, upper)
    return int(bounded)


def _weighted_choice(rng: np.random.Generator, values: list[str], weights: list[float]) -> str:
    """Pick one element of ``values`` with probability proportional to ``weights``.

    ``weights`` need not sum to one; they are rescaled by their total before
    being handed to ``rng.choice``. ``weights[i]`` applies to ``values[i]``.
    The selected element is returned as a built-in ``str``.
    """
    probabilities = np.asarray(weights, dtype=float)
    total = probabilities.sum()
    picked = rng.choice(values, p=probabilities / total)
    return str(picked)


def _generate_transaction_row(rng: np.random.Generator, is_fraud: int, idx: int) -> dict[str, object]:
    """Draw one synthetic transaction record from the fraud or legitimate profile.

    NOTE: every feature is drawn from ``rng`` in statement order, so reordering
    the draws below changes the data produced for a given seed.

    Args:
        rng: NumPy generator supplying all random draws.
        is_fraud: Truthy selects the fraud feature distributions, falsy the
            legitimate ones. Stored as ``int`` in the returned row.
        idx: Sequence number embedded in ``transaction_id`` as ``TXN-``
            followed by the number zero-padded to five digits.

    Returns:
        A dict holding the transaction features plus the ``is_fraud`` label.
    """
    if is_fraud:
        if rng.random() < 0.55:
            # 55% of fraud amounts cluster around a round figure, with noise
            # whose std-dev is the larger of 20 and 1.5% of that figure.
            base = float(rng.choice([500, 1000, 5000, 10000, 15000, 20000]))
            amount = _clip(base + rng.normal(0.0, max(20.0, base * 0.015)), 120.0, 50000.0)
        else:
            # Remaining fraud amounts come from a heavy-tailed lognormal.
            amount = _clip(rng.lognormal(mean=9.1, sigma=0.55), 150.0, 50000.0)
        # Fraud hours are restricted to 00:00-07:00, peaking at 02:00-04:00.
        hour_of_day = int(rng.choice([0, 1, 2, 3, 4, 5, 6, 7], p=[0.05, 0.07, 0.19, 0.22, 0.20, 0.14, 0.08, 0.05]))
        is_international = bool(rng.random() < 0.60)
        # Weights are positional against TRANSACTION_CATEGORIES.
        merchant_category = _weighted_choice(
            rng,
            TRANSACTION_CATEGORIES,
            [0.36, 0.06, 0.32, 0.08, 0.18],
        )
        transaction_velocity_1h = _clip_int(rng.poisson(6.5) + 1, 1, 25)
        amount_vs_avg_ratio = _clip(rng.normal(4.6, 1.2), 1.4, 12.0)
        is_new_device = bool(rng.random() < 0.70)
        distance_from_home_km = _clip(rng.lognormal(mean=6.45, sigma=0.48), 60.0, 6000.0)
        # Generator.integers excludes the upper bound: 1..3 and 1..44.
        failed_attempts_before = int(rng.integers(1, 4))
        account_age_days = int(rng.integers(1, 45))
    else:
        amount = _clip(rng.lognormal(mean=4.55, sigma=0.75), 5.0, 6500.0)
        if rng.random() < 0.08:
            # 8% of legitimate rows are overwritten with a near-round amount.
            amount = _clip(float(rng.choice([50, 100, 200, 500])) + rng.normal(0.0, 5.0), 5.0, 6500.0)
        # Relative weight per hour 0..23; normalised below because the raw
        # values are not required to sum to one.
        safe_hour_weights = np.array(
            [0.02, 0.02, 0.015, 0.015, 0.015, 0.02, 0.04, 0.06, 0.08, 0.09, 0.08, 0.07, 0.06, 0.05, 0.05, 0.05, 0.055, 0.06, 0.06, 0.055, 0.05, 0.04, 0.03, 0.02],
            dtype=float,
        )
        safe_hour_weights = safe_hour_weights / safe_hour_weights.sum()
        hour_of_day = int(rng.choice(np.arange(24), p=safe_hour_weights))
        is_international = bool(rng.random() < 0.11)
        # Weights are positional against TRANSACTION_CATEGORIES.
        merchant_category = _weighted_choice(
            rng,
            TRANSACTION_CATEGORIES,
            [0.12, 0.40, 0.05, 0.26, 0.17],
        )
        transaction_velocity_1h = _clip_int(rng.poisson(1.4), 0, 8)
        amount_vs_avg_ratio = _clip(rng.normal(1.15, 0.42), 0.2, 4.0)
        is_new_device = bool(rng.random() < 0.18)
        distance_from_home_km = _clip(rng.lognormal(mean=3.9, sigma=0.8), 0.5, 900.0)
        failed_attempts_before = 1 if rng.random() < 0.06 else 0
        # Upper bound exclusive: 30..3650 days.
        account_age_days = int(rng.integers(30, 3651))

    # Label-independent jitter applied to both profiles: 4% of rows get extra
    # noise on the ratio, 3% get -1/0/+1 added to the velocity.
    if rng.random() < 0.04:
        amount_vs_avg_ratio = _clip(amount_vs_avg_ratio + rng.normal(0.0, 0.35), 0.2, 12.0)
    if rng.random() < 0.03:
        transaction_velocity_1h = _clip_int(transaction_velocity_1h + rng.integers(-1, 2), 0, 25)

    return {
        "transaction_id": f"TXN-{idx:05d}",
        "amount": round(amount, 2),
        "hour_of_day": hour_of_day,
        "is_international": is_international,
        "merchant_category": merchant_category,
        "transaction_velocity_1h": transaction_velocity_1h,
        "amount_vs_avg_ratio": round(amount_vs_avg_ratio, 3),
        "is_new_device": is_new_device,
        "distance_from_home_km": round(distance_from_home_km, 2),
        "failed_attempts_before": failed_attempts_before,
        "account_age_days": account_age_days,
        "is_fraud": int(is_fraud),
    }


def _generate_credit_row(rng: np.random.Generator, is_default: int, idx: int) -> dict[str, object]:
    """Draw one synthetic credit application from the default or non-default profile.

    NOTE: every feature is drawn from ``rng`` in statement order, so reordering
    the draws below changes the data produced for a given seed.

    Args:
        rng: NumPy generator supplying all random draws.
        is_default: Truthy selects the defaulter feature distributions, falsy
            the non-defaulter ones. Stored as ``int`` in the returned row.
        idx: Sequence number embedded in ``applicant_id`` as ``APP-`` followed
            by the number zero-padded to five digits.

    Returns:
        A dict holding the application features plus the ``is_default`` label.
    """
    if is_default:
        # Defaulter profile: lower scores, higher DTI and utilisation, shorter
        # employment, more missed payments and inquiries, less collateral.
        credit_score = _clip_int(rng.normal(545, 42), 300, 720)
        debt_to_income_ratio = _clip(rng.beta(6.8, 3.6), 0.18, 0.98)
        employment_months = _clip_int(rng.gamma(1.6, 4.0), 0, 84)
        num_open_accounts = _clip_int(rng.normal(11.5, 3.0), 1, 22)
        payment_history_missed = _clip_int(rng.poisson(3.4), 0, 9)
        loan_amount = _clip(rng.lognormal(mean=10.9, sigma=0.42), 6000.0, 125000.0)
        revolving_utilization = _clip(rng.beta(8.4, 2.2), 0.25, 0.99)
        recent_hard_inquiries = _clip_int(rng.poisson(4.0), 0, 9)
        collateral_value = _clip(rng.normal(12000.0, 9000.0), 0.0, 120000.0)
        # Weights are positional against LOAN_PURPOSES.
        loan_purpose = _weighted_choice(
            rng,
            LOAN_PURPOSES,
            [0.08, 0.32, 0.10, 0.34, 0.16],
        )
    else:
        credit_score = _clip_int(rng.normal(712, 58), 360, 850)
        debt_to_income_ratio = _clip(rng.beta(3.1, 7.2), 0.01, 0.82)
        employment_months = _clip_int(rng.gamma(5.8, 18.0), 1, 360)
        num_open_accounts = _clip_int(rng.normal(6.4, 2.8), 1, 18)
        payment_history_missed = _clip_int(rng.poisson(0.45), 0, 4)
        loan_amount = _clip(rng.lognormal(mean=10.55, sigma=0.52), 3000.0, 150000.0)
        revolving_utilization = _clip(rng.beta(2.2, 4.8), 0.01, 0.92)
        recent_hard_inquiries = _clip_int(rng.poisson(1.1), 0, 6)
        collateral_value = _clip(rng.normal(54000.0, 28000.0), 0.0, 300000.0)
        # Weights are positional against LOAN_PURPOSES.
        loan_purpose = _weighted_choice(
            rng,
            LOAN_PURPOSES,
            [0.38, 0.15, 0.18, 0.11, 0.18],
        )

    if loan_purpose == "mortgage":
        # Mortgages are floored at 0.8-1.4x the loan amount. The floor is not
        # re-clipped, so it may exceed the collateral ceilings used above.
        collateral_value = max(collateral_value, loan_amount * rng.uniform(0.8, 1.4))
    if loan_purpose in {"personal", "consolidation"} and not is_default:
        # Non-defaulting unsecured-style loans get a small DTI shift
        # (mean +0.02), then are clipped back to the non-default range.
        debt_to_income_ratio = _clip(debt_to_income_ratio + rng.normal(0.02, 0.03), 0.01, 0.82)

    return {
        "applicant_id": f"APP-{idx:05d}",
        "credit_score": credit_score,
        "debt_to_income_ratio": round(debt_to_income_ratio, 4),
        "employment_months": employment_months,
        "num_open_accounts": num_open_accounts,
        "payment_history_missed": payment_history_missed,
        "loan_amount": round(loan_amount, 2),
        "revolving_utilization": round(revolving_utilization, 4),
        "recent_hard_inquiries": recent_hard_inquiries,
        "collateral_value": round(collateral_value, 2),
        "loan_purpose": loan_purpose,
        "is_default": int(is_default),
    }


def _generate_kyc_row(rng: np.random.Generator, is_anomaly: int, idx: int) -> dict[str, object]:
    """Draw one synthetic KYC application from the anomalous or normal profile.

    NOTE: every feature is drawn from ``rng`` in statement order, so reordering
    the draws below changes the data produced for a given seed.

    Args:
        rng: NumPy generator supplying all random draws.
        is_anomaly: Truthy selects the anomalous feature distributions, falsy
            the normal ones. Stored as ``int`` in the returned row.
        idx: Sequence number embedded in ``application_id`` as ``KYC-``
            followed by the number zero-padded to five digits.

    Returns:
        A dict holding the application features plus the ``is_anomaly`` label.
    """
    if is_anomaly:
        # Anomalous IDs are either brand new (1-6 days) or very old
        # (7300-9499 days). Both candidates are drawn, then one is picked with
        # equal probability.
        id_document_age_days = int(rng.choice([rng.integers(1, 7), rng.integers(7300, 9500)]))
        address_match_score = _clip(rng.beta(1.2, 6.0), 0.01, 0.45)
        name_vs_id_match_score = _clip(rng.beta(1.8, 4.8), 0.05, 0.65)
        selfie_liveness_score = _clip(rng.beta(1.5, 5.2), 0.02, 0.55)
        num_accounts_same_address = _clip_int(rng.normal(5.2, 1.4), 3, 10)
        phone_age_days = _clip_int(rng.gamma(1.8, 4.0), 1, 60)
        # Codes 1/2/3 are not defined in this file; anomalies are mostly 2.
        email_domain_risk = int(rng.choice([1, 2, 3], p=[0.10, 0.70, 0.20]))
        # True means the IP country matches the ID country (18% for anomalies).
        ip_country_vs_id_country_match = bool(rng.random() < 0.18)
        velocity_applications_7d = _clip_int(rng.normal(6.8, 2.0), 2, 16)
    else:
        id_document_age_days = _clip_int(rng.gamma(4.8, 290.0), 20, 5400)
        address_match_score = _clip(rng.beta(8.5, 1.8), 0.45, 1.0)
        name_vs_id_match_score = _clip(rng.beta(8.2, 1.6), 0.55, 1.0)
        selfie_liveness_score = _clip(rng.beta(9.0, 1.6), 0.50, 1.0)
        num_accounts_same_address = _clip_int(rng.poisson(1.2), 0, 4)
        phone_age_days = _clip_int(rng.gamma(5.5, 120.0), 15, 4000)
        # Normal applications are mostly code 1 and rarely code 2.
        email_domain_risk = int(rng.choice([1, 2, 3], p=[0.62, 0.05, 0.33]))
        ip_country_vs_id_country_match = bool(rng.random() < 0.96)
        velocity_applications_7d = _clip_int(rng.poisson(1.0), 0, 5)

    return {
        "application_id": f"KYC-{idx:05d}",
        "id_document_age_days": id_document_age_days,
        "address_match_score": round(address_match_score, 4),
        "name_vs_id_match_score": round(name_vs_id_match_score, 4),
        "selfie_liveness_score": round(selfie_liveness_score, 4),
        "num_accounts_same_address": num_accounts_same_address,
        "phone_age_days": phone_age_days,
        "email_domain_risk": email_domain_risk,
        "ip_country_vs_id_country_match": ip_country_vs_id_country_match,
        "velocity_applications_7d": velocity_applications_7d,
        "is_anomaly": int(is_anomaly),
    }


def _build_transaction_dataset() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Generate the synthetic transaction-fraud data and split it into train/test.

    12,000 rows are generated from a generator seeded with ``SEED``;
    ``int(12000 * 0.08)`` of them carry the fraud label. 2,000 rows are held
    out as the test set, stratified on ``is_fraud``.

    Returns:
        ``(train, test)`` frames, each sorted by ``transaction_id`` with a
        fresh index.
    """
    generator = np.random.default_rng(SEED)
    n_total = 12000
    n_fraud = int(n_total * 0.08)
    n_legit = n_total - n_fraud

    # Fixed label counts, shuffled so fraud rows are scattered through the ids.
    flags = np.array([1] * n_fraud + [0] * n_legit)
    generator.shuffle(flags)

    dataset = pd.DataFrame(
        [
            _generate_transaction_row(generator, int(flag), position)
            for position, flag in enumerate(flags, start=1)
        ]
    )
    splits = train_test_split(
        dataset,
        test_size=2000,
        random_state=SEED,
        stratify=dataset["is_fraud"],
    )
    train_part, test_part = (
        part.sort_values("transaction_id").reset_index(drop=True) for part in splits
    )
    return train_part, test_part


def _build_credit_dataset() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Generate the synthetic credit-risk data and split it into train/test.

    10,000 rows are generated from a generator seeded with ``SEED + 1``;
    ``int(10000 * 0.12)`` of them carry the default label. 2,000 rows are held
    out as the test set, stratified on ``is_default``.

    Returns:
        ``(train, test)`` frames, each sorted by ``applicant_id`` with a
        fresh index.
    """
    generator = np.random.default_rng(SEED + 1)
    n_total = 10000
    n_default = int(n_total * 0.12)
    n_performing = n_total - n_default

    # Fixed label counts, shuffled so defaults are scattered through the ids.
    flags = np.array([1] * n_default + [0] * n_performing)
    generator.shuffle(flags)

    dataset = pd.DataFrame(
        [
            _generate_credit_row(generator, int(flag), position)
            for position, flag in enumerate(flags, start=1)
        ]
    )
    splits = train_test_split(
        dataset,
        test_size=2000,
        random_state=SEED,
        stratify=dataset["is_default"],
    )
    train_part, test_part = (
        part.sort_values("applicant_id").reset_index(drop=True) for part in splits
    )
    return train_part, test_part


def _build_kyc_dataset() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Generate the synthetic KYC data and split it into train/test.

    6,000 rows are generated from a generator seeded with ``SEED + 2``;
    ``int(6000 * 0.05)`` of them carry the anomaly label. 1,000 rows are held
    out as the test set, stratified on ``is_anomaly``.

    Returns:
        ``(train, test)`` frames, each sorted by ``application_id`` with a
        fresh index. The training frame has the ``is_anomaly`` column
        removed; the test frame keeps it.
    """
    generator = np.random.default_rng(SEED + 2)
    n_total = 6000
    n_anomalous = int(n_total * 0.05)
    n_normal = n_total - n_anomalous

    # Fixed label counts, shuffled so anomalies are scattered through the ids.
    flags = np.array([1] * n_anomalous + [0] * n_normal)
    generator.shuffle(flags)

    dataset = pd.DataFrame(
        [
            _generate_kyc_row(generator, int(flag), position)
            for position, flag in enumerate(flags, start=1)
        ]
    )
    splits = train_test_split(
        dataset,
        test_size=1000,
        random_state=SEED,
        stratify=dataset["is_anomaly"],
    )
    train_part, test_part = (
        part.sort_values("application_id").reset_index(drop=True) for part in splits
    )
    # Only the held-out frame keeps its labels.
    unlabeled_train = train_part.drop(columns=["is_anomaly"])
    return unlabeled_train, test_part


def _alias_one(full_name: str) -> str:
    """Return ``"<first word> <initial of last word>."`` for a multi-word name.

    Names with fewer than two whitespace-separated words are returned
    unchanged.
    """
    tokens = full_name.split()
    if len(tokens) >= 2:
        given, family = tokens[0], tokens[-1]
        return f"{given} {family[0]}."
    return full_name


def _alias_two(full_name: str) -> str:
    """Return ``"<last word>, <first word>"`` for a multi-word name.

    Names with fewer than two whitespace-separated words are returned
    unchanged.
    """
    tokens = full_name.split()
    if len(tokens) >= 2:
        given, family = tokens[0], tokens[-1]
        return f"{family}, {given}"
    return full_name


def _build_sanctions_dataset() -> pd.DataFrame:
    """Assemble a 500-row synthetic sanctions / PEP / watchlist table.

    The list starts with two curated name lists (de-duplicated, order kept)
    and is topped up with unique random first/last-name combinations drawn
    from a generator seeded with ``SEED + 3``. Each name then receives two
    aliases, a uniformly chosen country, a weighted risk type, a risk score
    drawn from a type-specific clipped normal, and a listing date between
    2018-01-01 and 2,999 days later.

    Returns:
        A frame with columns ``full_name``, ``alias_1``, ``alias_2``,
        ``country``, ``risk_type``, ``risk_score`` and ``date_added``.
    """
    rng = np.random.default_rng(SEED + 3)
    ambiguous_names = [
        "John Smith",
        "Mohammed Ali",
        "Chen Wei",
        "Maria Garcia",
        "David Kim",
        "Wei Chen",
        "Ahmed Hassan",
        "Michael Brown",
        "James Johnson",
        "Aisha Khan",
        "Juan Perez",
        "Fatima Noor",
        "Priya Sharma",
        "Mohamed Hassan",
        "Carlos Silva",
        "Sarah Ahmed",
        "Yusuf Khan",
        "Omar Ali",
        "Li Wei",
        "Ana Martinez",
    ]
    common_legitimate_names = [
        "Emily Carter",
        "Olivia Turner",
        "Noah Bennett",
        "Liam Parker",
        "Mia Collins",
        "Ethan Brooks",
        "Sophia Reed",
        "Ava Morgan",
        "Lucas Hayes",
        "Charlotte Brooks",
        "Amelia Jenkins",
        "Benjamin Cooper",
        "Harper Diaz",
        "Elijah Ross",
        "Ella Murphy",
        "Grace Hughes",
        "Jack Foster",
        "Henry Price",
        "Lily Ward",
        "Mason Perry",
    ]
    first_names = [
        "Abdul", "Amina", "Carlos", "Chen", "Dmitri", "Elena", "Farah", "Grace", "Hassan", "Ivan",
        "Jamal", "Karim", "Lina", "Marta", "Nadia", "Omar", "Pavel", "Qasim", "Rania", "Sergei",
        "Tariq", "Umar", "Viktor", "Wang", "Xiu", "Yara", "Zain", "Anya", "Boris", "Celine",
        "Diego", "Ebrahim", "Fiona", "Giorgio", "Helena", "Ismail", "Jelena", "Khalid", "Leila", "Nikolai",
    ]
    last_names = [
        "Petrov", "Ivanov", "Haddad", "Rahman", "Mendoza", "Volkov", "Costa", "Akhtar", "Hussein", "Kim",
        "Zhang", "Garcia", "Morris", "Singh", "Kovacs", "Novak", "Rossi", "Dubois", "Silva", "Ibrahim",
        "Fischer", "Santos", "Ortega", "Khan", "Aliyev", "Pereira", "Muller", "Bennani", "Yilmaz", "Hassan",
        "Tan", "Lopes", "Sato", "Meyer", "Diallo", "Mensah", "Kassim", "Rahimi", "Saeed", "Ndlovu",
    ]

    target_rows = 500

    # Curated names go first; dict keys de-duplicate while keeping order.
    names: list[str] = list(dict.fromkeys(ambiguous_names + common_legitimate_names))
    seen = set(names)

    # Top up with random combinations, rejecting any name already taken.
    while len(names) < target_rows:
        candidate = f"{rng.choice(first_names)} {rng.choice(last_names)}"
        if candidate not in seen:
            seen.add(candidate)
            names.append(candidate)

    # (mean, std-dev, lower clip, upper clip) of the risk score per risk type;
    # anything that is neither SANCTIONS nor PEP uses the watchlist settings.
    score_settings = {
        "SANCTIONS": (0.92, 0.05, 0.75, 1.0),
        "PEP": (0.76, 0.08, 0.55, 0.95),
    }
    watchlist_settings = (0.62, 0.10, 0.35, 0.85)
    first_listing_day = pd.Timestamp("2018-01-01")

    records = []
    for full_name in names[:target_rows]:
        # Draw order per row: risk type, score, country, listing offset.
        risk_type = _weighted_choice(rng, RISK_TYPES, [0.50, 0.28, 0.22])
        mean, spread, low, high = score_settings.get(risk_type, watchlist_settings)
        risk_score = _clip(rng.normal(mean, spread), low, high)
        country = str(rng.choice(COUNTRIES))
        offset_days = int(rng.integers(0, 3000))
        listed_on = first_listing_day + pd.to_timedelta(offset_days, unit="D")
        records.append(
            {
                "full_name": full_name,
                "alias_1": _alias_one(full_name),
                "alias_2": _alias_two(full_name),
                "country": country,
                "risk_type": risk_type,
                "risk_score": round(risk_score, 3),
                # First ten characters of the timestamp text, i.e. YYYY-MM-DD.
                "date_added": str(listed_on)[:10],
            }
        )
    return pd.DataFrame(records)


def main() -> None:
    """Generate every synthetic dataset, write the CSVs and print a summary.

    Seven CSV files are written into ``DATA_DIR`` (created if missing). For
    each labelled frame a line with its row count and positive rate is
    printed; the KYC training frame has no label column and is therefore not
    summarised. The sanctions list is reported by row count only.
    """
    ensure_runtime_dirs()
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    transaction_train, transaction_test = _build_transaction_dataset()
    credit_train, credit_test = _build_credit_dataset()
    kyc_train, kyc_test = _build_kyc_dataset()
    sanctions_df = _build_sanctions_dataset()

    # (frame, file name) pairs, written in this order.
    exports = [
        (transaction_train, "transaction_fraud_train.csv"),
        (transaction_test, "transaction_fraud_test.csv"),
        (credit_train, "credit_risk_train.csv"),
        (credit_test, "credit_risk_test.csv"),
        (kyc_train, "kyc_identity_train.csv"),
        (kyc_test, "kyc_identity_test.csv"),
        (sanctions_df, "sanctions_pep_list.csv"),
    ]
    for frame, file_name in exports:
        frame.to_csv(DATA_DIR / file_name, index=False)

    # (summary name, frame, label column) for every frame that has labels.
    labelled = [
        ("transaction_fraud_train", transaction_train, "is_fraud"),
        ("transaction_fraud_test", transaction_test, "is_fraud"),
        ("credit_risk_train", credit_train, "is_default"),
        ("credit_risk_test", credit_test, "is_default"),
        ("kyc_identity_test", kyc_test, "is_anomaly"),
    ]
    summaries = [
        DatasetSummary(name, len(frame), float(frame[column].mean()))
        for name, frame, column in labelled
    ]

    for summary in summaries:
        print(f"{summary.name}: rows={summary.rows}, positive_rate={summary.positive_rate:.4f}")
    print(f"sanctions_pep_list: rows={len(sanctions_df)}")


if __name__ == "__main__":
    main()