waltertaya committed on
Commit
3b50aa4
·
verified ·
1 Parent(s): 327f9ba

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +394 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tempfile
4
+ from typing import List, Dict, Optional, Tuple
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from huggingface_hub import hf_hub_download
13
+ from sklearn.metrics import average_precision_score, precision_recall_fscore_support, roc_auc_score
14
+ from torch_geometric.data import Data
15
+ from torch_geometric.nn import SAGEConv
16
+
17
+
18
# Hub repo that hosts the trained checkpoint; override with the
# MODEL_REPO_ID environment variable to point at a different model.
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "walter-taya/aml-gnn-ibm-baseline-medium")
MODEL_REPO_TYPE = "model"
# Prefer GPU when the host provides one; fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Candidate header names used by find_column() to auto-detect the schema of
# an uploaded transaction CSV. Matching happens after normalization
# (lowercase, spaces -> underscores), so entries are written in that form.
TARGET_COL_CANDIDATES = ["is_laundering", "is laundering", "is_sar", "label", "target", "y"]
SRC_COL_CANDIDATES = ["from_account", "originator", "sender", "nameorig", "account", "src", "source"]
DST_COL_CANDIDATES = ["to_account", "beneficiary", "receiver", "namedest", "account.1", "dst", "target_account"]
AMOUNT_COL_CANDIDATES = ["amount", "amount_paid", "payment_amount", "transaction_amount", "amt"]
TIME_COL_CANDIDATES = ["timestamp", "step", "time", "date", "tran_date", "transaction_date"]
CURRENCY_COL_CANDIDATES = ["currency", "payment_currency", "ccy", "cur"]
PAYMENT_TYPE_COL_CANDIDATES = ["payment_type", "type", "transaction_type", "channel", "payment_method"]
CHANNEL_COL_CANDIDATES = ["channel", "delivery_channel", "device_channel", "network"]
30
+
31
+
32
+ def _normalize_name(col: str) -> str:
33
+ return str(col).strip().lower().replace(" ", "_")
34
+
35
+
36
def find_column(df: pd.DataFrame, candidates: List[str], required: bool = True) -> Optional[str]:
    """Return the first DataFrame column matching any candidate name.

    Matching is case/whitespace-insensitive via _normalize_name. Raises
    KeyError when nothing matches and required is True; returns None when
    nothing matches and required is False.
    """
    by_norm = {_normalize_name(original): original for original in df.columns}
    match = next(
        (by_norm[_normalize_name(cand)] for cand in candidates if _normalize_name(cand) in by_norm),
        None,
    )
    if match is None and required:
        raise KeyError(f"None of the candidate columns found: {candidates}")
    return match
45
+
46
+
47
def to_binary_label(series: pd.Series) -> pd.Series:
    """Coerce an arbitrary label column to a 0/1 integer Series.

    Booleans map directly, numeric values are positive -> 1, and anything
    else is string-matched (case-insensitively) against known "positive"
    tokens such as "sar" or "laundering".
    """
    if series.dtype == bool:
        return series.astype(int)
    if np.issubdtype(series.dtype, np.number):
        return (series > 0).astype(int)
    positive_tokens = {"1", "true", "yes", "sar", "laundering", "suspicious", "fraud"}
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.isin(positive_tokens).astype(int)
55
+
56
+
57
def safe_amount(x):
    """Parse a monetary value leniently; NaN/None/unparseable input -> 0.0.

    Thousands separators (commas) are stripped before conversion.
    """
    if pd.isna(x):
        return 0.0
    cleaned = str(x).replace(",", "")
    try:
        parsed = float(cleaned)
    except Exception:
        parsed = 0.0
    return parsed
64
+
65
+
66
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive integer "hour" and "dayofweek" columns from a "time" column.

    Numeric time values are treated as simulation steps (step % 24 is the
    hour, step // 24 cycles through a 7-day week). Non-numeric values are
    parsed as datetimes, with unparseable entries defaulting to 0. When no
    "time" column exists, both features are constant zero.

    Mutates and returns *df*.
    """
    if "time" not in df.columns:
        df["hour"] = 0
        df["dayofweek"] = 0
        return df

    t = df["time"]
    if np.issubdtype(t.dtype, np.number):
        as_float = t.astype(float)
        df["hour"] = (as_float % 24).fillna(0).astype(int)
        df["dayofweek"] = ((as_float // 24) % 7).fillna(0).astype(int)
    else:
        parsed = pd.to_datetime(t, errors="coerce")
        df["hour"] = parsed.dt.hour.fillna(0).astype(int)
        df["dayofweek"] = parsed.dt.dayofweek.fillna(0).astype(int)
    return df
84
+
85
+
86
class EdgeGNN(nn.Module):
    """Edge-level binary classifier over a transaction graph.

    Two GraphSAGE layers produce node embeddings; each edge is then scored
    by an MLP over [src_embedding || dst_embedding || edge_features].
    """

    def __init__(self, in_dim: int, edge_dim: int, hidden_dim: int, dropout: float = 0.2):
        super().__init__()
        # Attribute names (conv1/conv2/edge_mlp) must stay as-is: they are
        # the state_dict keys of the published checkpoint.
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.dropout = dropout
        self.edge_mlp = nn.Sequential(
            nn.Linear(hidden_dim * 2 + edge_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        )

    def encode_nodes(self, x, edge_index):
        """Return per-node embeddings of width hidden_dim."""
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.conv2(hidden, edge_index)

    def edge_logits(self, node_emb, edge_index, edge_attr):
        """Score each edge by concatenating endpoint embeddings with its features."""
        senders, receivers = edge_index[0], edge_index[1]
        combined = torch.cat([node_emb[senders], node_emb[receivers], edge_attr], dim=1)
        return self.edge_mlp(combined).squeeze(-1)

    def forward(self, x, edge_index, edge_attr):
        """Full pass: node encoding then edge scoring. Returns raw logits."""
        return self.edge_logits(self.encode_nodes(x, edge_index), edge_index, edge_attr)
115
+
116
+
117
def compute_metrics(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> Dict[str, float]:
    """Binary-classification metrics at a fixed decision threshold.

    Always reports precision/recall/f1; ROC-AUC and PR-AUC are included
    only when both classes are present (they are undefined otherwise).
    """
    predictions = (y_prob >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, predictions, average="binary", zero_division=0
    )
    results = {"precision": float(precision), "recall": float(recall), "f1": float(f1)}
    both_classes_present = len(np.unique(y_true)) > 1
    if both_classes_present:
        results["roc_auc"] = float(roc_auc_score(y_true, y_prob))
        results["pr_auc"] = float(average_precision_score(y_true, y_prob))
    return results
125
+
126
+
127
def prepare_graph(df: pd.DataFrame) -> Tuple[pd.DataFrame, Data, torch.Tensor, Dict[str, str]]:
    """Convert a raw transaction DataFrame into a PyG graph for scoring.

    Detects the CSV schema via the *_COL_CANDIDATES lists, builds one node
    per account and one directed edge per transaction, and engineers edge
    features (log amount, normalized hour/day, one-hot categoricals) plus
    per-account aggregate node features.

    Returns:
        (work_df, data, edge_attr, mapping) where work_df is the cleaned
        per-transaction frame (row-aligned with data.edge_attr), data is
        the torch_geometric Data object, and mapping records which original
        columns were detected.

    Raises:
        KeyError: if no source/destination/amount column can be detected.
    """
    # Required columns: a KeyError here aborts scoring with a clear message.
    src_col = find_column(df, SRC_COL_CANDIDATES)
    dst_col = find_column(df, DST_COL_CANDIDATES)
    amount_col = find_column(df, AMOUNT_COL_CANDIDATES)

    # Optional columns: None when absent.
    label_col = find_column(df, TARGET_COL_CANDIDATES, required=False)
    time_col = find_column(df, TIME_COL_CANDIDATES, required=False)
    currency_col = find_column(df, CURRENCY_COL_CANDIDATES, required=False)
    payment_type_col = find_column(df, PAYMENT_TYPE_COL_CANDIDATES, required=False)
    channel_col = find_column(df, CHANNEL_COL_CANDIDATES, required=False)

    # Build parallel lists of (original column, canonical name) to copy.
    selected = [src_col, dst_col, amount_col]
    rename_to = ["src", "dst", "amount"]

    if label_col:
        selected.append(label_col)
        rename_to.append("label")
    if time_col:
        selected.append(time_col)
        rename_to.append("time")
    if currency_col:
        selected.append(currency_col)
        rename_to.append("currency")
    if payment_type_col:
        selected.append(payment_type_col)
        rename_to.append("payment_type")
    if channel_col:
        selected.append(channel_col)
        rename_to.append("channel")

    work_df = df[selected].copy()
    work_df.columns = rename_to
    work_df["amount"] = work_df["amount"].apply(safe_amount).astype(float)
    # Rows without both endpoints cannot form an edge.
    work_df = work_df.dropna(subset=["src", "dst"]).reset_index(drop=True)
    work_df["src"] = work_df["src"].astype(str)
    work_df["dst"] = work_df["dst"].astype(str)

    if "label" in work_df.columns:
        work_df["label"] = to_binary_label(work_df["label"]).astype(int)

    for col in ["currency", "payment_type", "channel"]:
        if col in work_df.columns:
            # NOTE(review): astype(str) converts NaN to the string "nan"
            # BEFORE fillna runs, so "UNK" is never substituted — confirm
            # whether missing categoricals were meant to become "UNK".
            work_df[col] = work_df[col].astype(str).fillna("UNK")

    work_df = add_time_features(work_df)

    # Map every distinct account (as source or destination) to a node id.
    all_accounts = pd.Index(work_df["src"]).append(pd.Index(work_df["dst"])).unique()
    account_to_id = {acc: i for i, acc in enumerate(all_accounts)}
    work_df["src_id"] = work_df["src"].map(account_to_id)
    work_df["dst_id"] = work_df["dst"].map(account_to_id)

    # Shape (2, num_edges): one directed edge per transaction row.
    edge_index = torch.tensor(work_df[["src_id", "dst_id"]].to_numpy().T, dtype=torch.long)

    # Continuous edge features: log1p-compressed amount, hour/day scaled to [0, 1].
    edge_cont_cols = ["amount", "hour", "dayofweek"]
    edge_cont = work_df[edge_cont_cols].copy()
    edge_cont["amount"] = np.log1p(edge_cont["amount"].clip(lower=0.0))
    edge_cont["hour"] = edge_cont["hour"] / 23.0
    edge_cont["dayofweek"] = edge_cont["dayofweek"] / 6.0

    # One-hot encode whichever categorical columns were detected.
    # NOTE(review): get_dummies output width depends on the categories in
    # the uploaded CSV, so edge_attr width may not equal the edge_dim the
    # checkpoint was trained with — verify against the model config.
    cat_cols = [c for c in ["currency", "payment_type", "channel"] if c in work_df.columns]
    if cat_cols:
        edge_cat = pd.get_dummies(work_df[cat_cols], prefix=cat_cols, dummy_na=True)
        edge_feat_df = pd.concat([edge_cont, edge_cat], axis=1)
    else:
        edge_feat_df = edge_cont

    edge_attr = torch.tensor(edge_feat_df.to_numpy(), dtype=torch.float32)

    # Node features: per-account aggregates over outgoing/incoming edges.
    # reindex(..., fill_value=...) gives zeros to accounts with no edges in
    # that direction, keeping rows aligned with node ids 0..num_nodes-1.
    num_nodes = len(all_accounts)
    node_df = pd.DataFrame(index=np.arange(num_nodes))
    out_count = work_df.groupby("src_id").size().reindex(node_df.index, fill_value=0)
    in_count = work_df.groupby("dst_id").size().reindex(node_df.index, fill_value=0)
    out_amt_sum = work_df.groupby("src_id")["amount"].sum().reindex(node_df.index, fill_value=0.0)
    in_amt_sum = work_df.groupby("dst_id")["amount"].sum().reindex(node_df.index, fill_value=0.0)
    out_amt_mean = work_df.groupby("src_id")["amount"].mean().reindex(node_df.index, fill_value=0.0)
    in_amt_mean = work_df.groupby("dst_id")["amount"].mean().reindex(node_df.index, fill_value=0.0)
    out_hour_mean = work_df.groupby("src_id")["hour"].mean().reindex(node_df.index, fill_value=0.0)
    in_hour_mean = work_df.groupby("dst_id")["hour"].mean().reindex(node_df.index, fill_value=0.0)
    # Fraction of an account's transactions in the 00:00-05:59 window.
    out_night_ratio = (
        work_df.assign(night=work_df["hour"].isin([0, 1, 2, 3, 4, 5]).astype(int))
        .groupby("src_id")["night"]
        .mean()
        .reindex(node_df.index, fill_value=0.0)
    )
    in_night_ratio = (
        work_df.assign(night=work_df["hour"].isin([0, 1, 2, 3, 4, 5]).astype(int))
        .groupby("dst_id")["night"]
        .mean()
        .reindex(node_df.index, fill_value=0.0)
    )

    node_df["out_count"] = out_count
    node_df["in_count"] = in_count
    node_df["out_amt_sum"] = out_amt_sum
    node_df["in_amt_sum"] = in_amt_sum
    node_df["out_amt_mean"] = out_amt_mean
    node_df["in_amt_mean"] = in_amt_mean
    node_df["out_hour_mean"] = out_hour_mean
    node_df["in_hour_mean"] = in_hour_mean
    node_df["out_night_ratio"] = out_night_ratio
    node_df["in_night_ratio"] = in_night_ratio

    # Compress heavy-tailed count/amount features; ratios/means stay raw.
    for col in ["out_count", "in_count", "out_amt_sum", "in_amt_sum", "out_amt_mean", "in_amt_mean"]:
        node_df[col] = np.log1p(node_df[col].clip(lower=0.0))

    node_x = torch.tensor(node_df.to_numpy(), dtype=torch.float32)
    data = Data(x=node_x, edge_index=edge_index, edge_attr=edge_attr)

    # Record what was auto-detected so the UI can show it to the user.
    mapping = {
        "src": src_col,
        "dst": dst_col,
        "amount": amount_col,
        "label": label_col or "(not found)",
        "time": time_col or "(not found)",
    }
    return work_df, data, edge_attr, mapping
243
+
244
+
245
def load_model_and_config(repo_id: str):
    """Download the checkpoint from the HF Hub and build an EdgeGNN.

    Expects `config.json` (with in_dim/edge_dim/hidden_dim and optional
    dropout/best_threshold keys) and `pytorch_model.bin` in *repo_id*.

    Returns:
        (model, config_dict, default_threshold) with the model moved to
        DEVICE and switched to eval mode.

    Raises:
        Whatever hf_hub_download raises on network/auth failure, KeyError
        on a malformed config, or a torch error on a bad state dict.
    """
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json", repo_type=MODEL_REPO_TYPE)
    weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", repo_type=MODEL_REPO_TYPE)

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    model = EdgeGNN(
        in_dim=int(config["in_dim"]),
        edge_dim=int(config["edge_dim"]),
        hidden_dim=int(config["hidden_dim"]),
        dropout=float(config.get("dropout", 0.2)),
    ).to(DEVICE)
    # weights_only=True restricts unpickling to tensor data, guarding
    # against arbitrary-code-execution payloads in a downloaded checkpoint
    # (a plain state_dict loads fine under this mode; torch>=2.2 per
    # requirements supports the flag).
    state_dict = torch.load(weights_path, map_location=DEVICE, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()  # inference-only app: disable dropout

    # Threshold tuned at training time, if published; otherwise 0.5.
    default_threshold = float(config.get("best_threshold", 0.5))
    return model, config, default_threshold
264
+
265
+
266
# Module-level model state, populated once at import time so a single
# download/load serves every request.
MODEL = None
MODEL_CONFIG = None
DEFAULT_THRESHOLD = 0.5
MODEL_LOAD_ERROR = None  # non-None means loading failed; UI reports it

try:
    MODEL, MODEL_CONFIG, DEFAULT_THRESHOLD = load_model_and_config(MODEL_REPO_ID)
except Exception as ex:
    # Defer the failure: the app still starts, and score_transactions()
    # surfaces this message instead of crashing at startup.
    MODEL_LOAD_ERROR = str(ex)
275
+
276
+
277
def score_transactions(file_obj, threshold: float, top_k: int):
    """Gradio handler: score an uploaded transaction CSV with the GNN.

    Args:
        file_obj: uploaded file value from gr.File (exposes a .name path),
            or None when nothing was uploaded.
        threshold: probability cutoff for flagging a transaction.
        top_k: number of rows to show in each preview table.

    Returns:
        (markdown_summary, top_transactions_df, top_accounts_df, csv_path).
        On any failure the dataframes are empty and csv_path is None.
    """
    # Startup model-load failure is reported per-request instead of crashing.
    if MODEL_LOAD_ERROR:
        return (
            f"❌ Model load failed from `{MODEL_REPO_ID}`: {MODEL_LOAD_ERROR}",
            pd.DataFrame(),
            pd.DataFrame(),
            None,
        )

    if file_obj is None:
        return "Please upload a CSV file.", pd.DataFrame(), pd.DataFrame(), None

    try:
        raw_df = pd.read_csv(file_obj.name)
        work_df, data, _, mapping = prepare_graph(raw_df)

        # Full-graph forward pass; no_grad keeps memory flat for inference.
        with torch.no_grad():
            logits = MODEL(
                data.x.to(DEVICE),
                data.edge_index.to(DEVICE),
                data.edge_attr.to(DEVICE),
            )
        probs = torch.sigmoid(logits).detach().cpu().numpy()

        # Per-transaction scores, highest risk first.
        result_df = work_df.copy()
        result_df["pred_prob"] = probs
        result_df["pred_label"] = (result_df["pred_prob"] >= threshold).astype(int)
        result_df = result_df.sort_values("pred_prob", ascending=False).reset_index(drop=True)

        # Stack src and dst views so each transaction contributes its score
        # to both endpoint accounts.
        account_alerts = pd.concat(
            [
                result_df[["src", "pred_prob", "pred_label"]].rename(columns={"src": "account"}),
                result_df[["dst", "pred_prob", "pred_label"]].rename(columns={"dst": "account"}),
            ],
            axis=0,
            ignore_index=True,
        )

        # Roll transaction scores up to account level.
        account_risk = (
            account_alerts.groupby("account")
            .agg(
                max_txn_risk=("pred_prob", "max"),
                mean_txn_risk=("pred_prob", "mean"),
                txn_count=("pred_prob", "size"),
                pred_account_alert=("pred_label", "max"),
            )
            .reset_index()
            .sort_values("max_txn_risk", ascending=False)
        )

        # Evaluation is best-effort: only when a usable label column exists.
        metrics_block = ""
        if "label" in result_df.columns:
            y_true = result_df["label"].to_numpy().astype(int)
            if len(np.unique(y_true)) > 1:
                metrics = compute_metrics(y_true, result_df["pred_prob"].to_numpy(), threshold=threshold)
                metrics_block = "\n".join([f"- **{k}**: {v:.4f}" for k, v in metrics.items()])
            else:
                metrics_block = "- Ground-truth label has only one class; metrics skipped."
        else:
            metrics_block = "- Ground-truth label column not found; showing inference-only outputs."

        summary = (
            f"✅ Scored **{len(result_df):,}** transactions from `{os.path.basename(file_obj.name)}`\n\n"
            f"**Model repo**: `{MODEL_REPO_ID}` \n"
            f"**Threshold**: `{threshold:.3f}` \n"
            f"**Detected schema**: `{mapping}`\n\n"
            f"**Metrics (if label available)**\n{metrics_block}"
        )

        # max(1, ...) guards against a zero/negative slider value.
        top_txn = result_df.head(max(1, int(top_k)))
        top_accounts = account_risk.head(max(1, int(top_k)))

        # delete=False so the file survives this handler and can be served
        # as a download. NOTE(review): the temp file is never cleaned up,
        # and writing via tmp.name while the handle is open is POSIX-only —
        # fine on Linux hosts, would fail on Windows.
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
            result_df.to_csv(tmp.name, index=False)
            out_path = tmp.name

        return summary, top_txn, top_accounts, out_path

    except Exception as ex:
        # Deliberate catch-all: any failure becomes a UI message rather
        # than an unhandled server error.
        return f"❌ Inference failed: {ex}", pd.DataFrame(), pd.DataFrame(), None
357
+
358
+
359
# Declarative UI: component creation order determines on-page layout.
with gr.Blocks(title="AML GNN Inference Space") as demo:
    gr.Markdown(
        """
# AML Transaction Risk Scoring (GNN)
Upload a transaction CSV and score suspicious transactions using the model from Hugging Face Hub.

**Expected columns (flexible names supported):** source account, destination account, amount.
Optional: label, timestamp/time, currency, payment type, channel.
"""
    )

    # Inputs: file upload plus the two knobs score_transactions() consumes.
    with gr.Row():
        file_input = gr.File(label="Upload transaction CSV", file_types=[".csv"])
        # Default comes from the model config's best_threshold when available.
        threshold = gr.Slider(0.05, 0.95, value=DEFAULT_THRESHOLD, step=0.01, label="Decision threshold")
        top_k = gr.Slider(5, 100, value=20, step=1, label="Top rows to display")

    run_btn = gr.Button("Score Transactions", variant="primary")
    summary = gr.Markdown(label="Run summary")

    # Result tables, one tab per view.
    with gr.Tab("Top suspicious transactions"):
        top_txn_df = gr.Dataframe(label="Top scored transactions")

    with gr.Tab("Top risky accounts"):
        top_acc_df = gr.Dataframe(label="Top account risks")

    download_file = gr.File(label="Download full scored CSV")

    # Output order must match score_transactions()'s 4-tuple return.
    run_btn.click(
        fn=score_transactions,
        inputs=[file_input, threshold, top_k],
        outputs=[summary, top_txn_df, top_acc_df, download_file],
    )


if __name__ == "__main__":
    # Launch the app only when run as a script (not when imported).
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=5.0.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ scikit-learn>=1.3.0
5
+ torch>=2.2.0
6
+ torch-geometric>=2.5.0
7
+ huggingface_hub>=0.24.0