SignalMod / src /pipeline /run_golden_baseline_pipeline.py
Mirae Kang
feat: implement new models and improve UI, #23
46cc63a
"""
Golden Baseline strategy — frozen pretrained BERT, R-Drop squeeze, hybrid safety net.
uv run python -m src.pipeline.run_golden_baseline_pipeline
"""
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Directory three levels up from this file (parents[2] of the resolved path);
# every config and output path below is resolved relative to it.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Put that directory first on sys.path before the `src.*` imports that follow,
# so they can be resolved from it.
sys.path.insert(0, str(PROJECT_ROOT))
from src.data.dual_loader import load_dual_track_data
from src.evaluation.golden_baseline_report import write_golden_baseline_report
from src.evaluation.threshold_tuning import predict_with_threshold
from src.models.hybrid_ensemble import (
evaluate_ensemble,
save_ensemble_meta,
soft_vote_probs,
tune_ensemble_threshold,
)
from src.models.metadata_lr import MetadataLRModel
from src.models.transformer_trainer import (
evaluate_pretrained_bert_baseline,
train_transformer_stable,
)
from src.pipeline.run_hybrid_clean_pipeline import (
_branch_metrics,
_build_hf_dataset,
_json_safe,
_meta_frame,
)
from src.pipeline.run_performance_push_pipeline import infer_train_probs
from src.utils.logger import get_logger
# Module-level logger shared by every function in this file.
logger = get_logger(__name__)
# Config file used when the caller / CLI does not supply one.
DEFAULT_CONFIG = PROJECT_ROOT / "configs" / "golden_baseline_training.yaml"
def run_golden_baseline_pipeline(*, config_path: Path | None = None) -> dict:
    """Run the Golden Baseline strategy end to end and write its reports.

    The run has three stages:

    1. ``evaluate_pretrained_bert_baseline`` scores the pretrained model
       (logged as "all layers frozen, zero fine-tuning").
    2. ``train_transformer_stable`` trains the "performance squeeze" model
       and yields validation / test probabilities.
    3. A ``MetadataLRModel`` is fitted and soft-voted with the transformer;
       the ensemble threshold is tuned on the validation split unless
       ``ensemble.threshold_tuning.enabled`` is false, in which case 0.5 is
       used.

    Side effects: creates the reports directory, saves the LR model and the
    ensemble metadata, and writes a JSON metrics file plus a Markdown report
    named after the run id (timestamp).

    Args:
        config_path: YAML configuration to load. ``DEFAULT_CONFIG`` is used
            when this is ``None`` (or otherwise falsy).

    Returns:
        The metrics dict for the run, with one entry per stage
        (``golden_baseline``, ``performance_squeeze``,
        ``logistic_regression``, ``hybrid_safety_net``) plus run metadata.

    Raises:
        KeyError: if a required config section or key is missing.
    """
    cfg_path = config_path or DEFAULT_CONFIG
    # Context manager closes the handle deterministically; explicit UTF-8
    # keeps parsing independent of the platform's default locale encoding.
    with open(cfg_path, encoding="utf-8") as cfg_file:
        cfg = yaml.safe_load(cfg_file)

    pipe_cfg = cfg["pipeline"]
    rand = int(pipe_cfg["random_state"])
    test_size = float(pipe_cfg["test_size"])
    val_size = float(pipe_cfg["val_size"])
    # Gap thresholds are fractions here; they are reported as percentage
    # points (x100) in the metrics dict.
    max_gap = float(pipe_cfg.get("max_train_test_gap", 0.05))
    baseline_gap_target = float(pipe_cfg.get("baseline_gap_target", 0.01))
    squeeze_gap_target = float(pipe_cfg.get("squeeze_gap_target", 0.049))
    target_f1 = float(pipe_cfg.get("target_f1_weighted", 0.80))

    target_col = cfg["data"]["target_binary"]
    text_col = cfg["data"].get("text_column", "Text")

    out_cfg = cfg["output"]
    reports_dir = PROJECT_ROOT / out_cfg["reports_dir"]
    bert_dir = PROJECT_ROOT / out_cfg["transformer_dir"]
    lr_path = PROJECT_ROOT / out_cfg["lr_path"]
    meta_path = PROJECT_ROOT / out_cfg["ensemble_meta_path"]
    reports_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp doubles as the run identifier in report file names.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    logger.info("=" * 60)
    logger.info(f"GOLDEN BASELINE STRATEGY — run={run_id}")
    logger.info("=" * 60)

    df = load_dual_track_data(
        PROJECT_ROOT / cfg["data"]["raw_path"],
        processed_preprocessed=cfg["data"]["processed_preprocessed"],
        processed_stats=cfg["data"]["processed_stats"],
        target=target_col,
        text_column=text_col,
        id_column=cfg["data"].get("id_column", "CommentId"),
        features_config=cfg["data"]["features_config"],
        project_root=PROJECT_ROOT,
        write_preprocessed_if_missing=False,
    )
    y = df[target_col].astype(int)

    # Two stratified splits: first carve out the test set, then split the
    # remainder into train / validation. Note that `val_size` is therefore a
    # fraction of the train+val pool, not of the full dataset.
    idx_trainval, idx_test = train_test_split(
        df.index, test_size=test_size, random_state=rand, stratify=y
    )
    y_trainval = y.loc[idx_trainval]
    idx_train, idx_val = train_test_split(
        idx_trainval, test_size=val_size, random_state=rand, stratify=y_trainval
    )

    def _slice(index):
        """Bundle raw text, cleaned text, metadata frame and labels for a split."""
        return {
            "raw": df.loc[index, text_col],
            "clean": df.loc[index, "clean_text"],
            "meta": _meta_frame(df.loc[index]),
            "y": y.loc[index],
        }

    tr, va, te = _slice(idx_train), _slice(idx_val), _slice(idx_test)
    y_train_orig = y.loc[idx_train]
    y_test_arr = te["y"].astype(int).values
    y_val_arr = va["y"].astype(int).values

    # The transformer branch is fed the raw text column; the LR branch below
    # uses the cleaned text plus the metadata frame.
    hf_train = _build_hf_dataset(tr["raw"], tr["y"])
    hf_val = _build_hf_dataset(va["raw"], va["y"])
    hf_test = _build_hf_dataset(te["raw"], te["y"])

    all_metrics: dict = {
        "run_id": run_id,
        "pipeline": pipe_cfg.get("name", "golden_baseline"),
        "config": str(cfg_path),
        "target_f1_weighted": target_f1,
        "max_train_test_gap_pp": max_gap * 100,
        "baseline_gap_target_pp": baseline_gap_target * 100,
        "squeeze_gap_target_pp": squeeze_gap_target * 100,
        "augmentation": {"enabled": False},
    }

    # ── Step 1: Golden Baseline (frozen pretrained, no training) ─────────────
    logger.info("Step 1 — Golden Baseline (all layers frozen, zero fine-tuning)")
    base_label = cfg.get("baseline", {}).get("model_label", "Golden-Baseline-Toxic-BERT")
    baseline_result = evaluate_pretrained_bert_baseline(
        hf_train,
        hf_val,
        hf_test,
        y_train_orig.values,
        y_test_arr,
        y_val_arr,
        cfg,
        seed=rand,
        model_label=base_label,
    )
    base_m = baseline_result["metrics"]
    # A missing gap defaults to 1 (100 pp) so the check fails rather than passes.
    base_m["esencial_gap_ok"] = base_m.get("train_test_gap", 1) < baseline_gap_target
    base_m["gap_ok"] = base_m["esencial_gap_ok"]
    all_metrics["golden_baseline"] = base_m
    logger.info(
        f" Baseline F1w={base_m['f1_weighted']} gap_pp={base_m['train_test_gap_pp']} "
        f"{'✅' if base_m['esencial_gap_ok'] else '⚠️'}"
    )

    # ── Step 2: Performance Squeeze (last 2 layers + R-Drop) ───────────────
    tr_cfg = cfg.get("transformer", {})
    squeeze_label = tr_cfg.get("model_label", "Performance-Squeeze-Toxic-BERT")
    logger.info(
        f"Step 2 — Performance Squeeze (last {tr_cfg.get('train_last_n_layers', 2)} layers, "
        f"R-Drop, lr={tr_cfg.get('learning_rate', 5e-6)}, max_epochs={tr_cfg.get('max_epochs', 15)})"
    )
    squeeze_result = train_transformer_stable(
        hf_train,
        hf_val,
        hf_test,
        y_test_arr,
        y_val_arr,
        cfg,
        bert_dir,
        seed=rand,
        model_label=squeeze_label,
    )
    squeeze_m = squeeze_result["metrics"]
    squeeze_m["squeeze_gap_ok"] = squeeze_m.get("train_test_gap", 1) <= squeeze_gap_target
    squeeze_m["target_f1_hit"] = squeeze_m.get("f1_weighted", 0) >= target_f1
    all_metrics["performance_squeeze"] = squeeze_m

    bert_val_probs = squeeze_result["val_probs"]
    bert_test_probs = squeeze_result["test_probs"]
    # Train-split probabilities are needed later to measure the ensemble's
    # train/test gap.
    bert_train_probs = infer_train_probs(squeeze_result, tr["raw"])

    # ── Step 3: Hybrid Safety Net (C=0.001, 200 features) ────────────────────
    lr_cfg = cfg["logistic_regression"]
    tfidf_cfg = lr_cfg.get("tfidf", {})
    logger.info("Step 3 — Hybrid Safety Net (LR C=0.001, max_features=200)")
    lr_model = MetadataLRModel(lr_cfg, tfidf_cfg)
    lr_model.fit(tr["clean"], tr["meta"], tr["y"])
    lr_model.save(lr_path)

    # Column 1 of predict_proba is used as the positive-class probability.
    lr_val_probs = lr_model.predict_proba(va["clean"], va["meta"])[:, 1]
    lr_test_probs = lr_model.predict_proba(te["clean"], te["meta"])[:, 1]
    lr_train_probs = lr_model.predict_proba(tr["clean"], tr["meta"])[:, 1]
    lr_metrics = _branch_metrics(
        tr["y"],
        te["y"],
        va["y"],
        lr_train_probs,
        lr_val_probs,
        lr_test_probs,
        model_name="LR-Safety-Net",
    )
    all_metrics["logistic_regression"] = lr_metrics

    ens_cfg = cfg["ensemble"]
    bw = float(ens_cfg.get("bert_weight", 0.90))
    lw = float(ens_cfg.get("lr_weight", 0.10))
    th_cfg = ens_cfg.get("threshold_tuning", {})
    if th_cfg.get("enabled", True):
        # Threshold is tuned on validation data only; test data stays unseen.
        ens_threshold, val_f1_at_t = tune_ensemble_threshold(
            bert_val_probs,
            lr_val_probs,
            y_val_arr,
            bert_weight=bw,
            lr_weight=lw,
            metric=th_cfg.get("metric", "f1_weighted"),
            min_threshold=float(th_cfg.get("min_threshold", 0.30)),
            max_threshold=float(th_cfg.get("max_threshold", 0.70)),
            step=float(th_cfg.get("step", 0.01)),
        )
    else:
        ens_threshold = 0.5
        val_f1_at_t = None

    hybrid_metrics = evaluate_ensemble(
        bert_test_probs,
        lr_test_probs,
        y_test_arr,
        bert_weight=bw,
        lr_weight=lw,
        model_name="Hybrid-Safety-Net",
        threshold=ens_threshold,
    )

    # Score the ensemble on the training split with the same weights and
    # threshold to obtain the train/test F1 gap.
    ens_train_probs = soft_vote_probs(bert_train_probs, lr_train_probs, bw, lw)
    ens_train_preds = predict_with_threshold(ens_train_probs, ens_threshold)
    y_tr_arr = tr["y"].astype(int).values
    f1_train_ens = float(f1_score(y_tr_arr, ens_train_preds, average="weighted", zero_division=0))
    gap_ens = abs(f1_train_ens - hybrid_metrics["f1_weighted"])

    hybrid_metrics["f1_train"] = round(f1_train_ens, 4)
    hybrid_metrics["train_test_gap"] = round(gap_ens, 4)
    hybrid_metrics["train_test_gap_pp"] = round(gap_ens * 100, 2)
    hybrid_metrics["gap_ok"] = gap_ens <= max_gap
    hybrid_metrics["bert_weight"] = bw
    hybrid_metrics["lr_weight"] = lw
    # Compare against None explicitly: a validation F1 of exactly 0.0 is a
    # real (if poor) result and must not be reported as "not tuned".
    hybrid_metrics["val_f1_weighted_at_threshold"] = (
        round(val_f1_at_t, 4) if val_f1_at_t is not None else None
    )
    hybrid_metrics["target_f1_hit"] = hybrid_metrics["f1_weighted"] >= target_f1
    hybrid_metrics["briefing_compliant"] = hybrid_metrics["gap_ok"] and hybrid_metrics["target_f1_hit"]
    all_metrics["hybrid_safety_net"] = hybrid_metrics

    save_ensemble_meta(
        meta_path,
        {
            "run_id": run_id,
            "bert_dir": str(bert_dir),
            "lr_path": str(lr_path),
            "weights": {"bert": bw, "lr": lw, "fixed": True},
            "thresholds": {
                "bert": squeeze_m.get("threshold"),
                "lr": lr_metrics.get("threshold"),
                "ensemble": ens_threshold,
            },
        },
    )

    report_path = reports_dir / f"golden_baseline_run_{run_id}.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(_json_safe(all_metrics), f, indent=2)

    md_path = reports_dir / f"integrated_report_{run_id}.md"
    write_golden_baseline_report(all_metrics, md_path)
    logger.info(f"Report: {md_path}")

    _print_summary(all_metrics, target_f1, max_gap * 100, baseline_gap_target * 100)
    return all_metrics
def _print_summary(
    metrics: dict, target: float, max_gap_pp: float, baseline_gap_pp: float
) -> None:
    """Log a short baseline / hybrid summary for a finished run.

    Args:
        metrics: Run metrics dict; the ``golden_baseline`` and
            ``hybrid_safety_net`` entries are read (missing entries are
            treated as empty).
        target: Target weighted F1. Currently not used in the output; kept
            so the call signature stays stable.
        max_gap_pp: Maximum train/test gap in percentage points. Currently
            not used in the output; kept for the same reason.
        baseline_gap_pp: Baseline gap threshold in percentage points, shown
            in the baseline's success badge.
    """
    separator = "=" * 60
    base = metrics.get("golden_baseline", {})
    hybrid = metrics.get("hybrid_safety_net", {})

    # Show the configured threshold instead of a hard-coded "<1%" so the
    # badge stays truthful when `baseline_gap_target` is changed; with the
    # default (1.0 pp) this still renders as "<1%".
    if base.get("esencial_gap_ok"):
        base_badge = f"✅ <{baseline_gap_pp:g}%"
    else:
        base_badge = "⚠️"
    hybrid_badge = "✅ F1+gap" if hybrid.get("briefing_compliant") else "⚠️ below target"

    logger.info(separator)
    logger.info(
        f"BASELINE F1w={base.get('f1_weighted')} gap_pp={base.get('train_test_gap_pp')} "
        f"({base_badge})"
    )
    logger.info(
        f"HYBRID F1w={hybrid.get('f1_weighted')} gap_pp={hybrid.get('train_test_gap_pp')} "
        f"({hybrid_badge})"
    )
    logger.info(separator)
def main():
    """Command-line entry point: read ``--config`` and launch the pipeline."""
    cli = argparse.ArgumentParser(description="Golden Baseline strategy pipeline")
    cli.add_argument("--config", type=str, default=None)
    options = cli.parse_args()

    # An absent (or empty) --config leaves the choice of config file to the
    # pipeline function itself.
    chosen_config = None
    if options.config:
        chosen_config = Path(options.config)

    run_golden_baseline_pipeline(config_path=chosen_config)
# Run the CLI only when executed as a script / via `python -m`, not on import.
if __name__ == "__main__":
    main()