Spaces:

Danielfonseca1212
/

RelGNNDeepRelationalLearning

Sleeping

App Files Files Community

Danielfonseca1212 committed on Mar 4

Commit

22b1610

verified ·

1 Parent(s): d27f646

Create Xgboost baseline · py

Browse files

Files changed (1) hide show

Xgboost baseline · py +134 -0

Xgboost baseline · py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+baseline/xgboost_baseline.py
+Baseline XGBoost — features planas (flat features).
+Agrega todas as tabelas em uma única linha por cliente e treina XGBoost.
+Representa a abordagem clássica de ML sem estrutura relacional.
+"""
+import time
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
+from sklearn.ensemble import GradientBoostingClassifier
+from typing import Dict, Callable
class XGBoostBaseline:
    """Flat-feature gradient-boosting baseline for customer-level fraud labels.

    Every table is aggregated down to a single row per customer and a
    scikit-learn ``GradientBoostingClassifier`` is trained on the result.
    The scikit-learn estimator is used in place of the ``xgboost`` package so
    that no extra dependency is required.

    Attributes set by ``fit``:
        model_: the fitted ``GradientBoostingClassifier``.
        feature_names_: column names of the flat feature matrix, in order.
    """

    def __init__(self, n_estimators: int = 100, max_depth: int = 4):
        """Store the boosting hyper-parameters.

        Args:
            n_estimators: number of boosting stages passed to the classifier.
            max_depth: maximum depth of each individual tree.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth

    @staticmethod
    def _aggregate_per_customer(frame: pd.DataFrame, **named_aggs) -> pd.DataFrame:
        """Group ``frame`` by ``o_custkey`` and expose the key as ``c_custkey``.

        ``named_aggs`` are pandas named aggregations
        (``new_column=(source_column, func)``).
        """
        return (
            frame.groupby("o_custkey")
            .agg(**named_aggs)
            .reset_index()
            .rename(columns={"o_custkey": "c_custkey"})
        )

    def _build_flat_features(self, tables: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """Flatten all tables into one numeric feature row per customer.

        Args:
            tables: mapping with the keys ``"customers"``, ``"orders"``,
                ``"lineitem"``, ``"supplier"`` and ``"nation"``.

        Returns:
            A DataFrame with one row per row of ``tables["customers"]``, in
            the same order, without the ``c_custkey`` column and with missing
            values replaced by 0.
        """
        customers = tables["customers"]
        orders = tables["orders"]
        lineitem = tables["lineitem"]
        supplier = tables["supplier"]
        nation = tables["nation"]

        feat = customers[
            [
                "c_custkey",
                "c_acctbal",
                "c_nationkey",
                "c_account_age_days",
                "c_num_prev_orders",
            ]
        ].copy()

        # Order-level aggregates.
        order_agg = self._aggregate_per_customer(
            orders,
            ord_count=("o_orderkey", "count"),
            ord_total_mean=("o_totalprice", "mean"),
            ord_total_max=("o_totalprice", "max"),
            ord_total_std=("o_totalprice", "std"),
            ord_priority_mean=("o_shippriority", "mean"),
        )
        feat = feat.merge(order_agg, on="c_custkey", how="left")

        # Line-item aggregates: attach the owning customer through the order.
        li_with_cust = lineitem.merge(
            orders[["o_orderkey", "o_custkey"]], on="o_orderkey", how="left"
        )
        lineitem_agg = self._aggregate_per_customer(
            li_with_cust,
            li_count=("l_linenumber", "count"),
            li_qty_mean=("l_quantity", "mean"),
            li_price_mean=("l_extendedprice", "mean"),
            li_price_max=("l_extendedprice", "max"),
            li_discount_mean=("l_discount", "mean"),
            li_tax_mean=("l_tax", "mean"),
        )
        feat = feat.merge(lineitem_agg, on="c_custkey", how="left")

        # Supplier aggregates, reached through the line items.
        sup_with_cust = li_with_cust.merge(
            supplier, left_on="l_suppkey", right_on="s_suppkey", how="left"
        )
        supplier_agg = self._aggregate_per_customer(
            sup_with_cust,
            sup_acctbal_mean=("s_acctbal", "mean"),
            sup_risk_sum=("s_risk_flag", "sum"),
            sup_nation_nuniq=("s_nationkey", "nunique"),
        )
        feat = feat.merge(supplier_agg, on="c_custkey", how="left")

        # Region of the customer's nation. The lookup is de-duplicated on its
        # key: a repeated nation row would otherwise multiply customer rows
        # and break the positional alignment with the labels built in fit().
        nation_lookup = (
            nation[["n_nationkey", "n_regionkey"]]
            .drop_duplicates(subset="n_nationkey")
            .rename(columns={"n_nationkey": "c_nationkey"})
        )
        feat = feat.merge(nation_lookup, on="c_nationkey", how="left")

        feat = feat.drop(columns=["c_custkey"], errors="ignore")
        # Customers without orders / line items get 0 for every aggregate.
        return feat.fillna(0)

    def fit(
        self,
        tables: Dict[str, pd.DataFrame],
        log_fn: Callable[[str], None] = print,
    ) -> Dict[str, float]:
        """Train the classifier on flat features and evaluate on a hold-out split.

        A customer's label is the maximum of ``is_fraud`` over their orders
        (0 for customers without orders). 20% of the customers are held out
        with a stratified split; predictions use a 0.5 probability threshold.

        Args:
            tables: same mapping as accepted by ``_build_flat_features``;
                ``tables["orders"]`` must also contain ``is_fraud``.
            log_fn: callable receiving progress messages.

        Returns:
            Dict with ``auc``, ``f1``, ``precision`` and ``recall`` (rounded
            to 4 decimals) and ``train_time`` in seconds (rounded to 1
            decimal; it covers feature building, training and evaluation).
            ``auc`` is reported as 0.5 when it is undefined for the hold-out
            labels.
        """
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments.
        t_start = time.perf_counter()
        log_fn("   [XGBoost] Construindo features planas (flat)...")
        X = self._build_flat_features(tables)

        # Labels, aligned positionally with the rows of X.
        customers = tables["customers"]
        orders = tables["orders"]
        fraud_by_cust = orders.groupby("o_custkey")["is_fraud"].max()
        y = customers["c_custkey"].map(fraud_by_cust).fillna(0).values.astype(int)

        X_arr = X.values.astype(np.float32)
        log_fn(f"   [XGBoost] Shape features: {X_arr.shape}")

        idx_tr, idx_te = train_test_split(
            np.arange(len(y)),
            test_size=0.2,
            random_state=42,
            stratify=y,
        )
        model = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        )
        model.fit(X_arr[idx_tr], y[idx_tr])

        # Keep the fitted estimator so callers can reuse or inspect it.
        self.model_ = model
        self.feature_names_ = list(X.columns)

        probs = model.predict_proba(X_arr[idx_te])[:, 1]
        preds = (probs > 0.5).astype(int)
        y_true = y[idx_te]

        # With zero_division=0 these are well defined even when a class is
        # missing from the predictions, so no fallback value is needed.
        f1 = f1_score(y_true, preds, zero_division=0)
        precision = precision_score(y_true, preds, zero_division=0)
        recall = recall_score(y_true, preds, zero_division=0)

        # ROC AUC is undefined when the hold-out labels contain one class.
        # Depending on the scikit-learn version that surfaces either as a
        # ValueError or as NaN; both are mapped to the chance level 0.5.
        try:
            auc = float(roc_auc_score(y_true, probs))
        except ValueError:
            auc = float("nan")
        if np.isnan(auc):
            log_fn(
                "   [XGBoost] AUC indefinida (uma única classe no conjunto "
                "de teste); usando 0.5"
            )
            auc = 0.5

        train_time = round(time.perf_counter() - t_start, 1)
        # Plain Python floats keep the result JSON-serialisable.
        return {
            "auc": round(auc, 4),
            "f1": round(float(f1), 4),
            "precision": round(float(precision), 4),
            "recall": round(float(recall), 4),
            "train_time": train_time,
        }