# Source: test_final/evaluation/grouped_split_benchmark.py
# Last commit: k22056537 - "feat: sync integration updates across app and ML pipeline" (eb4abb8)
# File size: 3.6 kB
"""Compare pooled random split vs grouped LOPO for XGBoost."""
import os
import sys
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
from data_preparation.prepare_dataset import get_default_split_config, get_numpy_splits, load_per_person
from models.xgboost.config import build_xgb_classifier
# Dataset/model key handed to the data-preparation helpers
# (get_numpy_splits and load_per_person) by both evaluation protocols.
MODEL_NAME = "face_orientation"
# Destination of the generated markdown report:
# <project root>/evaluation/GROUPED_SPLIT_BENCHMARK.md
OUT_PATH = os.path.join(_PROJECT_ROOT, "evaluation", "GROUPED_SPLIT_BENCHMARK.md")
def run_pooled_split():
    """Train and score XGBoost on the pooled train/val/test split.

    The split ratios and seed come from ``get_default_split_config``; the
    features are requested unscaled.  The validation split is passed to
    ``fit`` as the evaluation set (the classifier is built with
    ``early_stopping_rounds=30``), and all metrics are computed on the test
    split only.

    Returns:
        dict: ``accuracy``, weighted ``f1`` and ``auc`` as plain floats.
    """
    ratios, random_seed = get_default_split_config()
    data, _, _, _ = get_numpy_splits(
        model_name=MODEL_NAME,
        split_ratios=ratios,
        seed=random_seed,
        scale=False,
    )

    classifier = build_xgb_classifier(
        random_seed, verbosity=0, early_stopping_rounds=30
    )
    validation_set = [(data["X_val"], data["y_val"])]
    classifier.fit(
        data["X_train"],
        data["y_train"],
        eval_set=validation_set,
        verbose=False,
    )

    # Column 1 of predict_proba is used as the score (presumably the
    # positive-class probability) and thresholded at 0.5 for hard labels.
    scores = classifier.predict_proba(data["X_test"])[:, 1]
    hard_labels = (scores >= 0.5).astype(int)
    truth = data["y_test"]

    metrics = {}
    metrics["accuracy"] = float(accuracy_score(truth, hard_labels))
    metrics["f1"] = float(f1_score(truth, hard_labels, average="weighted"))
    metrics["auc"] = float(roc_auc_score(truth, scores))
    return metrics
def run_grouped_lopo():
    """Evaluate XGBoost with leave-one-person-out (LOPO) cross-validation.

    For every person returned by ``load_per_person`` a fresh classifier is
    trained on the concatenated samples of all *other* persons and scored on
    the held-out person.  No validation set is used here, so the classifier
    is built without early stopping.

    ROC-AUC is undefined when the held-out person's labels contain a single
    class, so such folds are left out of the AUC average instead of aborting
    (or poisoning with NaN) the whole benchmark.  Accuracy and F1 are always
    averaged over every fold.

    Returns:
        dict: mean ``accuracy``, mean weighted ``f1``, mean ``auc`` (NaN if
        no fold had both classes) and ``folds`` (the number of persons).
    """
    by_person, _, _ = load_per_person(MODEL_NAME)
    persons = sorted(by_person.keys())
    scores = {"accuracy": [], "f1": [], "auc": []}
    _, seed = get_default_split_config()

    for held_out in persons:
        # Build the training pool once per fold and reuse it for X and y.
        train_people = [p for p in persons if p != held_out]
        train_x = np.concatenate([by_person[p][0] for p in train_people], axis=0)
        train_y = np.concatenate([by_person[p][1] for p in train_people], axis=0)
        test_x, test_y = by_person[held_out]

        model = build_xgb_classifier(seed, verbosity=0)
        model.fit(train_x, train_y, verbose=False)

        probs = model.predict_proba(test_x)[:, 1]
        preds = (probs >= 0.5).astype(int)

        scores["accuracy"].append(float(accuracy_score(test_y, preds)))
        scores["f1"].append(float(f1_score(test_y, preds, average="weighted")))
        # Only score AUC when both classes are present in the held-out labels.
        if np.unique(test_y).size >= 2:
            scores["auc"].append(float(roc_auc_score(test_y, probs)))

    mean_auc = float(np.mean(scores["auc"])) if scores["auc"] else float("nan")
    return {
        "accuracy": float(np.mean(scores["accuracy"])),
        "f1": float(np.mean(scores["f1"])),
        "auc": mean_auc,
        "folds": len(persons),
    }
def write_report(pooled, grouped, out_path=None):
    """Write the pooled-vs-grouped comparison as a markdown report.

    Args:
        pooled: dict with ``accuracy``, ``f1`` and ``auc`` for the pooled
            random split.
        grouped: dict with ``accuracy``, ``f1``, ``auc`` and ``folds`` for
            the grouped LOPO run.
        out_path: destination file; defaults to the module-level
            ``OUT_PATH`` when omitted.
    """
    if out_path is None:
        out_path = OUT_PATH

    # XGB_BASE_PARAMS is not imported at module level, so referencing it
    # directly raised NameError.  Resolve it lazily instead.
    # NOTE(review): assumes it is defined next to build_xgb_classifier in
    # models.xgboost.config - confirm.
    try:
        from models.xgboost.config import XGB_BASE_PARAMS as xgb_params
    except ImportError:
        xgb_params = "unavailable (models.xgboost.config.XGB_BASE_PARAMS not importable)"

    lines = [
        "# Grouped vs pooled split benchmark",
        "",
        "This compares the same XGBoost config under two evaluation protocols.",
        "",
        f"Config: `{xgb_params}`",
        "",
        "| Protocol | Accuracy | F1 (weighted) | ROC-AUC |",
        "|----------|---------:|--------------:|--------:|",
        f"| Pooled random split (70/15/15) | {pooled['accuracy']:.4f} | {pooled['f1']:.4f} | {pooled['auc']:.4f} |",
        f"| Grouped LOPO ({grouped['folds']} folds) | {grouped['accuracy']:.4f} | {grouped['f1']:.4f} | {grouped['auc']:.4f} |",
        "",
        "Use grouped LOPO as the primary generalisation metric when reporting model quality.",
        "",
    ]
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"[LOG] Wrote {out_path}")
def main():
    """Run both protocols, write the markdown report, and print the F1 summary."""
    pooled_metrics = run_pooled_split()
    grouped_metrics = run_grouped_lopo()
    write_report(pooled_metrics, grouped_metrics)

    summary = "[DONE] pooled_f1={:.4f} grouped_f1={:.4f}".format(
        pooled_metrics["f1"], grouped_metrics["f1"]
    )
    print(summary)


# Only run the benchmark when executed as a script, not on import.
if __name__ == "__main__":
    main()