Spaces:

Danielfonseca1212
/

RelGNNDeepRelationalLearning

Sleeping

App Files Files Community

Danielfonseca1212 commited on Mar 4

Commit

cf49f9c

verified ·

1 Parent(s): c9959a3

Create trainer.py

Browse files

Files changed (1) hide show

trainer.py +232 -0

trainer.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""
+relgnn/trainer.py
+Loop de treinamento do RelGNN.
+Extrai features numéricas diretamente das tabelas SQL (sem grafo),
+agrega por entidade alvo (customers), e treina end-to-end.
+"""
+import time
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
+from typing import Dict, List, Tuple, Callable, Optional
+from data.routes import AtomicRoute
+# ─── FEATURE EXTRACTION ───────────────────────────────────────────────────────
+NUMERIC_COLS = {
+    "customers": ["c_acctbal", "c_nationkey", "c_account_age_days", "c_num_prev_orders"],
+    "orders":    ["o_totalprice", "o_shippriority"],
+    "lineitem":  ["l_quantity", "l_extendedprice", "l_discount", "l_tax"],
+    "supplier":  ["s_acctbal", "s_nationkey", "s_risk_flag"],
+    "nation":    ["n_nationkey", "n_regionkey"],
+    "part":      ["p_retailprice"],
+}
+def extract_features(tables: Dict, n_customers: int) -> Tuple[Dict, np.ndarray]:
+    """
+    Extrai features numéricas das tabelas e agrega por cliente (entidade alvo).
+    Retorna:
+        table_features: {table_name: np.ndarray [n_customers, feature_dim]}
+        labels:         np.ndarray [n_customers]  (is_fraud)
+    """
+    import pandas as pd
+    customers = tables["customers"]
+    orders    = tables["orders"]
+    # Labels: 1 se algum pedido do cliente é fraude
+    fraud_by_customer = orders.groupby("o_custkey")["is_fraud"].max()
+    labels = customers["c_custkey"].map(fraud_by_customer).fillna(0).values.astype(float)
+    table_features = {}
+    # ── Customers: direto ─────────────────────────────────────────────────────
+    cols = [c for c in NUMERIC_COLS["customers"] if c in customers.columns]
+    table_features["customers"] = customers[cols].fillna(0).values.astype(np.float32)
+    # ── Orders: agrega por cliente (mean + max + count) ───────────────────────
+    order_cols = [c for c in NUMERIC_COLS["orders"] if c in orders.columns]
+    ord_mean = orders.groupby("o_custkey")[order_cols].mean()
+    ord_max  = orders.groupby("o_custkey")[order_cols].max()
+    ord_cnt  = orders.groupby("o_custkey").size().rename("order_count")
+    ord_agg  = ord_mean.join(ord_max, rsuffix="_max").join(ord_cnt)
+    ord_agg  = customers[["c_custkey"]].set_index("c_custkey").join(ord_agg).fillna(0)
+    table_features["orders"] = ord_agg.values.astype(np.float32)
+    # ── Lineitem: agrega via orders → customer ────────────────────────────────
+    lineitem = tables["lineitem"]
+    li_cols  = [c for c in NUMERIC_COLS["lineitem"] if c in lineitem.columns]
+    li_with_cust = lineitem.merge(
+        orders[["o_orderkey", "o_custkey"]], on="o_orderkey", how="left"
+    )
+    li_mean = li_with_cust.groupby("o_custkey")[li_cols].mean()
+    li_max  = li_with_cust.groupby("o_custkey")[li_cols].max()
+    li_cnt  = li_with_cust.groupby("o_custkey").size().rename("lineitem_count")
+    li_agg  = li_mean.join(li_max, rsuffix="_max").join(li_cnt)
+    li_agg  = customers[["c_custkey"]].set_index("c_custkey").join(li_agg).fillna(0)
+    table_features["lineitem"] = li_agg.values.astype(np.float32)
+    # ── Supplier: agrega via lineitem → orders → customer ────────────────────
+    supplier = tables["supplier"]
+    sup_cols = [c for c in NUMERIC_COLS["supplier"] if c in supplier.columns]
+    sup_with_cust = li_with_cust.merge(supplier, left_on="l_suppkey", right_on="s_suppkey", how="left")
+    sup_mean = sup_with_cust.groupby("o_custkey")[sup_cols].mean()
+    sup_agg  = customers[["c_custkey"]].set_index("c_custkey").join(sup_mean).fillna(0)
+    table_features["supplier"] = sup_agg.values.astype(np.float32)
+    # ── Nation: join direto ───────────────────────────────────────────────────
+    nation   = tables["nation"]
+    nat_cols = [c for c in NUMERIC_COLS["nation"] if c in nation.columns]
+    nat_agg  = customers[["c_custkey", "c_nationkey"]].merge(
+        nation, left_on="c_nationkey", right_on="n_nationkey", how="left"
+    )[nat_cols].fillna(0)
+    table_features["nation"] = nat_agg.values.astype(np.float32)
+    # ── Part: agrega via lineitem → customer ──────────────────────────────────
+    part     = tables["part"]
+    par_cols = [c for c in NUMERIC_COLS["part"] if c in part.columns]
+    par_with_cust = li_with_cust.merge(part, left_on="l_partkey", right_on="p_partkey", how="left")
+    par_mean = par_with_cust.groupby("o_custkey")[par_cols].mean()
+    par_agg  = customers[["c_custkey"]].set_index("c_custkey").join(par_mean).fillna(0)
+    table_features["part"] = par_agg.values.astype(np.float32)
+    # Normaliza features (min-max por coluna)
+    for key in table_features:
+        feat = table_features[key]
+        col_min = feat.min(axis=0, keepdims=True)
+        col_max = feat.max(axis=0, keepdims=True)
+        denom = np.where((col_max - col_min) == 0, 1, col_max - col_min)
+        table_features[key] = (feat - col_min) / denom
+    return table_features, labels
+# ─── TRAINER ─────────────────────────────────────────────────────────────────
+class Trainer:
+    def __init__(self, model, config):
+        self.model  = model
+        self.config = config
+    def fit(
+        self,
+        tables: Dict,
+        routes: List[AtomicRoute],
+        log_fn: Callable = print,
+        progress_fn=None,
+    ) -> Tuple[Dict, List[Dict]]:
+        t_start = time.time()
+        H = self.config.hidden_dim
+        D = self.config.dropout
+        LR = self.config.learning_rate
+        EPOCHS = self.config.num_epochs
+        # 1. Extrai features
+        table_features_np, labels = extract_features(tables, len(tables["customers"]))
+        feature_dims = {k: v.shape[1] for k, v in table_features_np.items()}
+        # 2. Build do modelo (agora que sabemos as dims)
+        self.model.build(feature_dims, routes)
+        optimizer = optim.AdamW(self.model.parameters(), lr=LR, weight_decay=1e-4)
+        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
+        # 3. Split treino/teste estratificado
+        n = len(labels)
+        idx = np.arange(n)
+        idx_tr, idx_te = train_test_split(idx, test_size=0.2, random_state=42,
+                                           stratify=(labels > 0.5).astype(int))
+        def to_tensor(feat_dict, idx):
+            return {k: torch.tensor(v[idx], dtype=torch.float32)
+                    for k, v in feat_dict.items()}
+        y_tr = torch.tensor(labels[idx_tr], dtype=torch.float32)
+        y_te = torch.tensor(labels[idx_te], dtype=torch.float32)
+        # Peso para classe positiva (fraude é rara)
+        pos_weight = torch.tensor([(y_tr == 0).sum() / max((y_tr == 1).sum(), 1)])
+        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
+        history = []
+        log_interval = max(1, EPOCHS // 10)
+        self.model.train()
+        for epoch in range(1, EPOCHS + 1):
+            optimizer.zero_grad()
+            feat_tr = to_tensor(table_features_np, idx_tr)
+            logits, _ = self.model(feat_tr)
+            loss = loss_fn(logits, y_tr)
+            loss.backward()
+            nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+            optimizer.step()
+            scheduler.step()
+            if epoch % log_interval == 0 or epoch == EPOCHS:
+                self.model.eval()
+                with torch.no_grad():
+                    feat_te = to_tensor(table_features_np, idx_te)
+                    logits_te, _ = self.model(feat_te)
+                    probs_te = torch.sigmoid(logits_te).numpy()
+                try:
+                    auc = roc_auc_score(labels[idx_te], probs_te)
+                except Exception:
+                    auc = 0.5
+                history.append({"epoch": epoch, "loss": float(loss), "auc": auc})
+                if epoch % (log_interval * 2) == 0 or epoch == EPOCHS:
+                    log_fn(f"   Época {epoch:3d}/{EPOCHS} | Loss: {float(loss):.4f} | AUC: {auc:.4f}")
+                self.model.train()
+                if progress_fn:
+                    pct = 0.30 + 0.35 * (epoch / EPOCHS)
+                    progress_fn(pct, desc=f"RelGNN treino — época {epoch}/{EPOCHS}")
+        # Métricas finais
+        self.model.eval()
+        with torch.no_grad():
+            feat_te = to_tensor(table_features_np, idx_te)
+            logits_te, attn_info = self.model(feat_te)
+            probs_te = torch.sigmoid(logits_te).numpy()
+        preds = (probs_te > 0.5).astype(int)
+        y_true = labels[idx_te].astype(int)
+        try:
+            auc       = roc_auc_score(y_true, probs_te)
+            f1        = f1_score(y_true, preds, zero_division=0)
+            precision = precision_score(y_true, preds, zero_division=0)
+            recall    = recall_score(y_true, preds, zero_division=0)
+        except Exception:
+            auc = f1 = precision = recall = 0.5
+        train_time = round(time.time() - t_start, 1)
+        metrics = {
+            "auc":        round(auc, 4),
+            "f1":         round(f1, 4),
+            "precision":  round(precision, 4),
+            "recall":     round(recall, 4),
+            "train_time": train_time,
+        }
+        # Atualiza pesos de atenção nas rotas com valores reais
+        route_weights = torch.softmax(self.model.hierarchical.route_weights, dim=0)
+        for i, route in enumerate(routes):
+            if i < len(route_weights):
+                route.attention_weight = float(route_weights[i].item())
+                route.active = route.attention_weight > 0.15
+        return metrics, history