import math
from collections import defaultdict
from itertools import combinations
from typing import Any, Dict, Iterable, List

import pandas as pd
import scipy


def compute_pct_improvement_over_baseline(
    df: pd.DataFrame,
    baseline_model: str = "xgboost_ensemble",
    metric: str = "Accuracy",
) -> pd.DataFrame:
    """Compute the percentage improvement of each model over a baseline model.

    For each row the value is
    ``((model_metric - baseline_metric) / baseline_metric) * 100`` where the
    baseline is the score of ``baseline_model`` on the same dataset.

    Args:
        df: DataFrame with columns 'model', 'dataset_name', and the metric
            column.
        baseline_model: The model to use as baseline
            (default: "xgboost_ensemble").
        metric: The metric to compute improvement on (default: "Accuracy").

    Returns:
        A copy of ``df`` with a new ``"%↗ over XGBoost"`` float column. The
        column name is fixed regardless of ``baseline_model``. Rows whose
        dataset has no baseline score, or a baseline of exactly 0, get NaN.
        An empty ``df`` is returned as-is, without the new column.
    """
    if df.empty:
        return df

    df = df.copy()

    # Baseline score per dataset; if the baseline model appears several times
    # for one dataset, the last occurrence wins (dict semantics).
    baseline_scores = (
        df[df["model"] == baseline_model]
        .set_index("dataset_name")[metric]
        .to_dict()
    )

    # Look up the baseline for every row; datasets without one become NaN.
    baseline = pd.Series(
        [baseline_scores.get(name) for name in df["dataset_name"]],
        index=df.index,
        dtype="float64",
    )
    # A zero baseline makes the ratio undefined, so blank it out as well.
    baseline = baseline.mask(baseline == 0)

    df["%↗ over XGBoost"] = ((df[metric] - baseline) / baseline) * 100
    return df


def scores_to_battles(
    df: pd.DataFrame, metric: str = "Accuracy"
) -> List[Dict[str, Any]]:
    """Turn per-dataset scores into pairwise "battle" records.

    Within every dataset each unordered pair of rows produces one record.
    Rows are ordered by descending ``metric`` first, so ``model_a`` is the
    higher-ranked (or tied) model of the pair.

    Args:
        df: DataFrame with columns 'model', 'dataset_name', and the metric
            column.
        metric: Name of the score column to compare (default: "Accuracy").

    Returns:
        A list of dicts with keys ``"model_a"``, ``"model_b"``, ``"winner"``
        (``"model_a"``, ``"model_b"`` or ``"tie"``) and ``"dataset"``.
        Rows whose metric is missing (NaN) are skipped: a missing score cannot
        be compared, and NaN would otherwise fail both ``==`` and ``>`` and be
        recorded as a win for the unscored model.
    """
    battles: List[Dict[str, Any]] = []
    for dataset, group in df.groupby("dataset_name"):
        # Drop unscored rows, then rank the remaining ones best-first.
        ranked = group.dropna(subset=[metric]).sort_values(
            by=metric, ascending=False
        )
        entries = list(zip(ranked["model"], ranked[metric]))
        for (model_a, score_a), (model_b, score_b) in combinations(entries, 2):
            if score_a == score_b:
                winner = "tie"
            elif score_a > score_b:
                winner = "model_a"
            else:
                # Not expected after the descending sort; kept so the outcome
                # is always decided by the scores themselves.
                winner = "model_b"
            battles.append(
                {
                    "model_a": model_a,
                    "model_b": model_b,
                    "winner": winner,
                    "dataset": dataset,
                }
            )
    return battles


def _sigmoid(x: float, eps: float = 1e-7) -> float:
    """Stable sigmoid with clipped output.

    Uses the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))``, which does not
    overflow for large ``|x|``, and clips the result to ``[eps, 1 - eps]`` so
    that callers can safely take ``log(p)`` and ``log(1 - p)``.
    """
    val = 0.5 * (1 + math.tanh(0.5 * x))
    return max(eps, min(1.0 - eps, val))
def compute_bt_elo(
    battles: Iterable[Dict[str, str]],
    SCALE: float = 400.0,
    BASE: float = 10.0,
    INIT_RATING: float = 1000.0,
    lr: float = 0.05,
    n_iter: int = 1000,
    use_scipy: bool = True,
) -> Dict[str, float]:
    """Fit a Bradley--Terry model and return Elo-style ratings.

    ``BASE`` controls the link function scale. If ``BASE=10`` (the default),
    the win probability follows the usual Elo form
    ``P(win) = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))``.

    With ``use_scipy=True`` the negative log-likelihood is minimised with
    :func:`scipy.optimize.minimize` (BFGS). If NumPy/SciPy cannot be imported,
    or ``use_scipy=False``, a plain gradient-descent loop is used instead.

    Args:
        battles: Iterable of records with keys ``"model_a"``, ``"model_b"``
            and ``"winner"``. A winner equal to ``"model_a"`` counts as a win
            for model A, a winner starting with ``"tie"`` counts as half a
            win, and anything else counts as a win for model B. The iterable
            is consumed exactly once, so generators are accepted.
        SCALE: Multiplier turning fitted strengths into rating points.
        BASE: Base of the exponential in the win-probability formula.
        INIT_RATING: Offset added to every scaled strength.
        lr: Learning rate of the gradient-descent fallback.
        n_iter: Number of iterations of the gradient-descent fallback.
        use_scipy: Whether to try the SciPy optimiser first.

    Returns:
        Mapping ``model -> rating`` ordered from highest to lowest rating.
        An empty mapping is returned when ``battles`` is empty.
    """
    # Materialise first: `battles` may be a one-shot iterator, and it is
    # needed several times below.
    battles_list = list(battles)
    if not battles_list:
        return {}

    models = sorted(
        {b["model_a"] for b in battles_list}
        | {b["model_b"] for b in battles_list}
    )
    pairs = [(b["model_a"], b["model_b"]) for b in battles_list]
    # Outcome from model A's point of view: 1 = win, 0.5 = tie, 0 = loss.
    outcomes = [
        0.5
        if str(b["winner"]).startswith("tie")
        else (1.0 if b["winner"] == "model_a" else 0.0)
        for b in battles_list
    ]
    log_base = math.log(BASE)

    if use_scipy:
        try:
            import numpy as np
            from scipy.optimize import minimize
        except ImportError:  # pragma: no cover - SciPy not available
            use_scipy = False

    if use_scipy:
        idx = {m: k for k, m in enumerate(models)}
        a_idx = np.array([idx[a] for a, _ in pairs], dtype=np.intp)
        b_idx = np.array([idx[b] for _, b in pairs], dtype=np.intp)
        y = np.array(outcomes, dtype=float)
        eps = 1e-7  # same clipping bound as `_sigmoid`'s default

        def nll(theta_vec: "np.ndarray") -> float:
            """Negative log-likelihood of all battles (vectorised)."""
            s = log_base * (theta_vec[a_idx] - theta_vec[b_idx])
            # tanh form of the sigmoid, clipped so both logs stay finite.
            p = np.clip(0.5 * (1.0 + np.tanh(0.5 * s)), eps, 1.0 - eps)
            # Binary cross entropy with y in [0, 1]
            return float(-np.sum(y * np.log(p) + (1.0 - y) * np.log(1.0 - p)))

        res = minimize(nll, np.zeros(len(models)), method="BFGS")
        # The likelihood only depends on differences, so centre the solution.
        theta_opt = res.x - res.x.mean()
        theta = {m: float(theta_opt[idx[m]]) for m in models}
    else:
        n_battles = len(battles_list)
        theta = {m: 0.0 for m in models}
        for _ in range(n_iter):
            grad = {m: 0.0 for m in models}
            for (a, b), outcome in zip(pairs, outcomes):
                p = _sigmoid(log_base * (theta[a] - theta[b]))
                diff = (p - outcome) * log_base
                grad[a] += diff
                grad[b] -= diff
            for m in models:
                theta[m] -= lr * grad[m] / n_battles

    # Strongest model first; ties keep the alphabetical order of `models`.
    return {
        m: SCALE * theta[m] + INIT_RATING
        for m in sorted(models, key=lambda x: -theta[x])
    }