File size: 3,418 Bytes
cc1ad5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from __future__ import annotations

import importlib
import json
import sys
from pathlib import Path

import pandas as pd

ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from agents.common import APP_VERSION, DATA_DIR, save_metadata, utc_now_iso
from agents.credit_risk.model import train_model as train_credit_model
from agents.kyc_identity.model import train_model as train_kyc_model
from agents.transaction_fraud.model import train_model as train_transaction_model
from data.generate_all import main as generate_all_datasets


def _typo(name: str) -> str:
    parts = name.split()
    if not parts:
        return name
    target = max(parts, key=len)
    if len(target) < 5:
        return name
    chars = list(target)
    chars[1], chars[2] = chars[2], chars[1]
    mutated = "".join(chars)
    return name.replace(target, mutated, 1)


def _validate_sanctions() -> dict[str, object]:
    sanctions_module = importlib.import_module("agents.sanctions_pep.matcher")
    sanctions_module = importlib.reload(sanctions_module)
    sanctions_df = pd.read_csv(DATA_DIR / "sanctions_pep_list.csv")

    exact_names = sanctions_df["full_name"].head(5).tolist()
    fuzzy_names = [_typo(name) for name in sanctions_df["full_name"].iloc[5:10].tolist()]
    misses = ["Alice Walker", "Daniel Mercer", "Nina Holloway", "Peter Whitmore", "Alicia Stone"]

    total = 0
    passed = 0

    for name in exact_names:
        total += 1
        result = sanctions_module.screen({"query_name": name})
        passed += int(result["match_found"] and result["best_match"]["match_score"] == 1.0)

    for name in fuzzy_names:
        total += 1
        result = sanctions_module.screen({"query_name": name})
        passed += int(result["match_found"] and result["best_match"]["match_score"] >= 0.7)

    for name in misses:
        total += 1
        result = sanctions_module.screen({"query_name": name})
        passed += int((not result["match_found"]) or (result["best_match"]["match_score"] < 0.6))

    hit_rate = passed / total
    if hit_rate < 0.95:
        raise RuntimeError(f"sanctions_pep hit_rate below threshold: {hit_rate:.4f}")

    return {
        "version": APP_VERSION,
        "artifact": str((DATA_DIR / "sanctions_pep_list.csv").relative_to(ROOT_DIR)),
        "metrics": {"hit_rate": round(hit_rate, 4)},
    }


def main() -> None:
    generate_all_datasets()

    metadata = {
        "version": APP_VERSION,
        "training_date": utc_now_iso(),
        "models": {},
    }

    metadata["models"]["transaction_fraud"] = train_transaction_model(
        DATA_DIR / "transaction_fraud_train.csv",
        DATA_DIR / "transaction_fraud_test.csv",
    )
    metadata["models"]["credit_risk"] = train_credit_model(
        DATA_DIR / "credit_risk_train.csv",
        DATA_DIR / "credit_risk_test.csv",
    )
    metadata["models"]["kyc_identity"] = train_kyc_model(
        DATA_DIR / "kyc_identity_train.csv",
        DATA_DIR / "kyc_identity_test.csv",
    )
    metadata["models"]["sanctions_pep"] = _validate_sanctions()
    metadata["models"]["risk_consultant"] = {
        "version": APP_VERSION,
        "artifact": "env:LLM_API_KEY or static FAQ fallback",
        "metrics": {"source": "llm_or_static_faq"},
    }

    save_metadata(metadata)
    print(json.dumps(metadata, indent=2))


if __name__ == "__main__":
    main()