"""
Golden Baseline strategy — frozen pretrained BERT, R-Drop squeeze, hybrid safety net.

Usage:
    uv run python -m src.pipeline.run_golden_baseline_pipeline
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from datetime import datetime |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
| import yaml |
| from sklearn.metrics import f1_score |
| from sklearn.model_selection import train_test_split |
|
|
| PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| sys.path.insert(0, str(PROJECT_ROOT)) |
|
|
| from src.data.dual_loader import load_dual_track_data |
| from src.evaluation.golden_baseline_report import write_golden_baseline_report |
| from src.evaluation.threshold_tuning import predict_with_threshold |
| from src.models.hybrid_ensemble import ( |
| evaluate_ensemble, |
| save_ensemble_meta, |
| soft_vote_probs, |
| tune_ensemble_threshold, |
| ) |
| from src.models.metadata_lr import MetadataLRModel |
| from src.models.transformer_trainer import ( |
| evaluate_pretrained_bert_baseline, |
| train_transformer_stable, |
| ) |
| from src.pipeline.run_hybrid_clean_pipeline import ( |
| _branch_metrics, |
| _build_hf_dataset, |
| _json_safe, |
| _meta_frame, |
| ) |
| from src.pipeline.run_performance_push_pipeline import infer_train_probs |
| from src.utils.logger import get_logger |
|
|
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Configuration file used when the caller (or the CLI) does not supply one.
DEFAULT_CONFIG = PROJECT_ROOT.joinpath("configs", "golden_baseline_training.yaml")
|
|
|
|
def run_golden_baseline_pipeline(*, config_path: Path | None = None) -> dict:
    """Run the Golden Baseline pipeline end to end and return its metrics.

    The run has three stages, each stored under its own key of the result:

    1. ``golden_baseline``: ``evaluate_pretrained_bert_baseline`` on the
       train/val/test splits.
    2. ``performance_squeeze``: ``train_transformer_stable``; its validation
       and test probabilities feed the ensemble.
    3. ``logistic_regression`` and ``hybrid_safety_net``: a ``MetadataLRModel``
       fitted on the clean text plus metadata, soft-voted with the transformer
       probabilities at a threshold tuned on the validation split (0.5 when
       tuning is disabled in the config).

    Side effects: creates the reports directory, saves the LR model to the
    configured ``lr_path``, writes the ensemble metadata file, and writes a
    JSON and a Markdown report named after the run id.

    Args:
        config_path: YAML configuration file. ``DEFAULT_CONFIG`` is used when
            this is ``None``.

    Returns:
        The metrics dictionary for the whole run (the same data that is
        written, after ``_json_safe``, to the JSON report).

    Raises:
        KeyError: If a required configuration key (for example
            ``pipeline.random_state`` or ``data.target_binary``) is missing.
    """
    cfg_path = config_path or DEFAULT_CONFIG
    # Context manager so the config handle is closed deterministically.
    with open(cfg_path, encoding="utf-8") as cfg_file:
        cfg = yaml.safe_load(cfg_file)

    rand = int(cfg["pipeline"]["random_state"])
    test_size = float(cfg["pipeline"]["test_size"])
    val_size = float(cfg["pipeline"]["val_size"])
    max_gap = float(cfg["pipeline"].get("max_train_test_gap", 0.05))
    baseline_gap_target = float(cfg["pipeline"].get("baseline_gap_target", 0.01))
    squeeze_gap_target = float(cfg["pipeline"].get("squeeze_gap_target", 0.049))
    target_f1 = float(cfg["pipeline"].get("target_f1_weighted", 0.80))
    target_col = cfg["data"]["target_binary"]
    text_col = cfg["data"].get("text_column", "Text")

    out_cfg = cfg["output"]
    reports_dir = PROJECT_ROOT / out_cfg["reports_dir"]
    bert_dir = PROJECT_ROOT / out_cfg["transformer_dir"]
    lr_path = PROJECT_ROOT / out_cfg["lr_path"]
    meta_path = PROJECT_ROOT / out_cfg["ensemble_meta_path"]
    reports_dir.mkdir(parents=True, exist_ok=True)

    # The run id doubles as the suffix of both report file names.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    logger.info("=" * 60)
    logger.info(f"GOLDEN BASELINE STRATEGY — run={run_id}")
    logger.info("=" * 60)

    df = load_dual_track_data(
        PROJECT_ROOT / cfg["data"]["raw_path"],
        processed_preprocessed=cfg["data"]["processed_preprocessed"],
        processed_stats=cfg["data"]["processed_stats"],
        target=target_col,
        text_column=text_col,
        id_column=cfg["data"].get("id_column", "CommentId"),
        features_config=cfg["data"]["features_config"],
        project_root=PROJECT_ROOT,
        write_preprocessed_if_missing=False,
    )

    # Two stratified splits: hold out the test set first, then carve the
    # validation set out of the remainder. ``val_size`` is therefore a
    # fraction of train+val, not of the full dataset.
    y = df[target_col].astype(int)
    idx_trainval, idx_test = train_test_split(
        df.index, test_size=test_size, random_state=rand, stratify=y
    )
    y_trainval = y.loc[idx_trainval]
    idx_train, idx_val = train_test_split(
        idx_trainval, test_size=val_size, random_state=rand, stratify=y_trainval
    )

    def _slice(index):
        """Collect raw text, clean text, metadata frame and labels for ``index``."""
        return {
            "raw": df.loc[index, text_col],
            "clean": df.loc[index, "clean_text"],
            "meta": _meta_frame(df.loc[index]),
            "y": y.loc[index],
        }

    tr, va, te = _slice(idx_train), _slice(idx_val), _slice(idx_test)
    y_train_orig = y.loc[idx_train]
    y_test_arr = te["y"].astype(int).values
    y_val_arr = va["y"].astype(int).values

    # The transformer branch consumes the raw text; the LR branch below uses
    # the cleaned text plus the metadata frame.
    hf_train = _build_hf_dataset(tr["raw"], tr["y"])
    hf_val = _build_hf_dataset(va["raw"], va["y"])
    hf_test = _build_hf_dataset(te["raw"], te["y"])

    all_metrics: dict = {
        "run_id": run_id,
        "pipeline": cfg.get("pipeline", {}).get("name", "golden_baseline"),
        "config": str(cfg_path),
        "target_f1_weighted": target_f1,
        "max_train_test_gap_pp": max_gap * 100,
        "baseline_gap_target_pp": baseline_gap_target * 100,
        "squeeze_gap_target_pp": squeeze_gap_target * 100,
        "augmentation": {"enabled": False},
    }

    # ---- Step 1: pretrained baseline ------------------------------------
    logger.info("Step 1 — Golden Baseline (all layers frozen, zero fine-tuning)")
    base_label = cfg.get("baseline", {}).get("model_label", "Golden-Baseline-Toxic-BERT")
    baseline_result = evaluate_pretrained_bert_baseline(
        hf_train,
        hf_val,
        hf_test,
        y_train_orig.values,
        y_test_arr,
        y_val_arr,
        cfg,
        seed=rand,
        model_label=base_label,
    )
    base_m = baseline_result["metrics"]
    # A missing gap defaults to 1 so the check fails rather than passes.
    # The key spelling "esencial_gap_ok" is kept as-is: it is read back by
    # _print_summary and stored in the reports.
    base_m["esencial_gap_ok"] = base_m.get("train_test_gap", 1) < baseline_gap_target
    base_m["gap_ok"] = base_m["esencial_gap_ok"]
    all_metrics["golden_baseline"] = base_m
    # .get() keeps a missing metric from aborting the run inside a log call.
    logger.info(
        f" Baseline F1w={base_m.get('f1_weighted')} gap_pp={base_m.get('train_test_gap_pp')} "
        f"{'✅' if base_m['esencial_gap_ok'] else '⚠️'}"
    )

    # ---- Step 2: fine-tuned transformer ---------------------------------
    tr_cfg = cfg.get("transformer", {})
    squeeze_label = tr_cfg.get("model_label", "Performance-Squeeze-Toxic-BERT")
    logger.info(
        f"Step 2 — Performance Squeeze (last {tr_cfg.get('train_last_n_layers', 2)} layers, "
        f"R-Drop, lr={tr_cfg.get('learning_rate', 5e-6)}, max_epochs={tr_cfg.get('max_epochs', 15)})"
    )
    squeeze_result = train_transformer_stable(
        hf_train,
        hf_val,
        hf_test,
        y_test_arr,
        y_val_arr,
        cfg,
        bert_dir,
        seed=rand,
        model_label=squeeze_label,
    )
    squeeze_m = squeeze_result["metrics"]
    squeeze_m["squeeze_gap_ok"] = squeeze_m.get("train_test_gap", 1) <= squeeze_gap_target
    squeeze_m["target_f1_hit"] = squeeze_m.get("f1_weighted", 0) >= target_f1
    all_metrics["performance_squeeze"] = squeeze_m

    bert_val_probs = squeeze_result["val_probs"]
    bert_test_probs = squeeze_result["test_probs"]
    # Train-split probabilities are needed for the ensemble's train F1 below.
    bert_train_probs = infer_train_probs(squeeze_result, tr["raw"])

    # ---- Step 3: LR branch and soft-vote ensemble -----------------------
    lr_cfg = cfg["logistic_regression"]
    tfidf_cfg = lr_cfg.get("tfidf", {})
    # NOTE(review): the hyper-parameters in this message are hard-coded text,
    # not read from lr_cfg/tfidf_cfg; confirm they match the config in use.
    logger.info("Step 3 — Hybrid Safety Net (LR C=0.001, max_features=200)")
    lr_model = MetadataLRModel(lr_cfg, tfidf_cfg)
    lr_model.fit(tr["clean"], tr["meta"], tr["y"])
    lr_model.save(lr_path)

    lr_val_probs = lr_model.predict_proba(va["clean"], va["meta"])[:, 1]
    lr_test_probs = lr_model.predict_proba(te["clean"], te["meta"])[:, 1]
    lr_train_probs = lr_model.predict_proba(tr["clean"], tr["meta"])[:, 1]

    lr_metrics = _branch_metrics(
        tr["y"],
        te["y"],
        va["y"],
        lr_train_probs,
        lr_val_probs,
        lr_test_probs,
        model_name="LR-Safety-Net",
    )
    all_metrics["logistic_regression"] = lr_metrics

    ens_cfg = cfg["ensemble"]
    bw = float(ens_cfg.get("bert_weight", 0.90))
    lw = float(ens_cfg.get("lr_weight", 0.10))
    th_cfg = ens_cfg.get("threshold_tuning", {})
    if th_cfg.get("enabled", True):
        # The threshold is tuned on validation data only.
        ens_threshold, val_f1_at_t = tune_ensemble_threshold(
            bert_val_probs,
            lr_val_probs,
            y_val_arr,
            bert_weight=bw,
            lr_weight=lw,
            metric=th_cfg.get("metric", "f1_weighted"),
            min_threshold=float(th_cfg.get("min_threshold", 0.30)),
            max_threshold=float(th_cfg.get("max_threshold", 0.70)),
            step=float(th_cfg.get("step", 0.01)),
        )
    else:
        ens_threshold = 0.5
        val_f1_at_t = None

    hybrid_metrics = evaluate_ensemble(
        bert_test_probs,
        lr_test_probs,
        y_test_arr,
        bert_weight=bw,
        lr_weight=lw,
        model_name="Hybrid-Safety-Net",
        threshold=ens_threshold,
    )
    # Score the ensemble on the training split with the same weights and
    # threshold to obtain the train/test gap.
    ens_train_probs = soft_vote_probs(bert_train_probs, lr_train_probs, bw, lw)
    ens_train_preds = predict_with_threshold(ens_train_probs, ens_threshold)
    y_tr_arr = tr["y"].astype(int).values
    f1_train_ens = float(f1_score(y_tr_arr, ens_train_preds, average="weighted", zero_division=0))
    gap_ens = abs(f1_train_ens - hybrid_metrics["f1_weighted"])
    hybrid_metrics["f1_train"] = round(f1_train_ens, 4)
    hybrid_metrics["train_test_gap"] = round(gap_ens, 4)
    hybrid_metrics["train_test_gap_pp"] = round(gap_ens * 100, 2)
    hybrid_metrics["gap_ok"] = gap_ens <= max_gap
    hybrid_metrics["bert_weight"] = bw
    hybrid_metrics["lr_weight"] = lw
    # Compare against None explicitly: a score of 0.0 is a real value and
    # must be reported, not silently replaced by None.
    hybrid_metrics["val_f1_weighted_at_threshold"] = (
        round(val_f1_at_t, 4) if val_f1_at_t is not None else None
    )
    hybrid_metrics["target_f1_hit"] = hybrid_metrics["f1_weighted"] >= target_f1
    hybrid_metrics["briefing_compliant"] = hybrid_metrics["gap_ok"] and hybrid_metrics["target_f1_hit"]
    all_metrics["hybrid_safety_net"] = hybrid_metrics

    save_ensemble_meta(
        meta_path,
        {
            "run_id": run_id,
            "bert_dir": str(bert_dir),
            "lr_path": str(lr_path),
            "weights": {"bert": bw, "lr": lw, "fixed": True},
            "thresholds": {
                "bert": squeeze_m.get("threshold"),
                "lr": lr_metrics.get("threshold"),
                "ensemble": ens_threshold,
            },
        },
    )

    report_path = reports_dir / f"golden_baseline_run_{run_id}.json"
    # Explicit encoding so the report does not depend on the platform locale.
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(_json_safe(all_metrics), f, indent=2)

    md_path = reports_dir / f"integrated_report_{run_id}.md"
    write_golden_baseline_report(all_metrics, md_path)
    logger.info(f"Report: {md_path}")

    _print_summary(all_metrics, target_f1, max_gap * 100, baseline_gap_target * 100)
    return all_metrics
|
|
|
|
def _print_summary(
    metrics: dict, target: float, max_gap_pp: float, baseline_gap_pp: float
) -> None:
    """Log a closing summary of the baseline and hybrid results.

    Args:
        metrics: Run metrics. The ``golden_baseline`` and
            ``hybrid_safety_net`` entries are read; a missing entry or field
            is logged as ``None`` instead of raising.
        target: Target weighted F1. Accepted for interface compatibility; it
            is not used in the current output.
        max_gap_pp: Maximum train/test gap in percentage points. Accepted for
            interface compatibility; it is not used in the current output.
        baseline_gap_pp: Baseline gap limit in percentage points, shown next
            to the baseline status when the gap check passed.
    """
    rule = "=" * 60
    base = metrics.get("golden_baseline", {})
    hybrid = metrics.get("hybrid_safety_net", {})

    # Render the limit from the argument instead of a fixed "<1%", so the
    # message stays truthful when the configured baseline limit is not 1 pp.
    # ":g" prints 1.0 as "1", matching the previous text at the default.
    base_status = f"✅ <{baseline_gap_pp:g}%" if base.get("esencial_gap_ok") else "⚠️"
    hybrid_status = "✅ F1+gap" if hybrid.get("briefing_compliant") else "⚠️ below target"

    logger.info(rule)
    logger.info(
        f"BASELINE F1w={base.get('f1_weighted')} gap_pp={base.get('train_test_gap_pp')} "
        f"({base_status})"
    )
    logger.info(
        f"HYBRID F1w={hybrid.get('f1_weighted')} gap_pp={hybrid.get('train_test_gap_pp')} "
        f"({hybrid_status})"
    )
    logger.info(rule)
|
|
|
|
def main():
    """Command-line entry point: read ``--config`` and run the pipeline.

    When ``--config`` is omitted or empty, ``None`` is passed as
    ``config_path`` to ``run_golden_baseline_pipeline``.
    """
    cli = argparse.ArgumentParser(description="Golden Baseline strategy pipeline")
    cli.add_argument("--config", type=str, default=None)
    options = cli.parse_args()

    chosen_config = None
    if options.config:
        chosen_config = Path(options.config)

    run_golden_baseline_pipeline(config_path=chosen_config)
|
|
|
|
# Run the CLI only when this file is executed as a script (for example via
# ``python -m``), never as a side effect of importing the module.
if __name__ == "__main__":
    main()
|
|