Spaces:

devrup404
/

SignalMod

Sleeping

SignalMod / src /pipeline /run_golden_baseline_pipeline.py

Mirae Kang

feat: implement new models and improve UI, #23

46cc63a 5 days ago

11.1 kB

	"""
	Golden Baseline strategy — frozen pretrained BERT, R-Drop squeeze, hybrid safety net.

	uv run python -m src.pipeline.run_golden_baseline_pipeline
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	from datetime import datetime
	from pathlib import Path

	import numpy as np
	import pandas as pd
	import yaml
	from sklearn.metrics import f1_score
	from sklearn.model_selection import train_test_split

	# Repository root: this file sits at <root>/src/pipeline/, so parents[2] is <root>.
	PROJECT_ROOT = Path(__file__).resolve().parents[2]
	# Put the root first on sys.path before the `src.*` imports that follow,
	# so they resolve regardless of the current working directory.
	sys.path.insert(0, str(PROJECT_ROOT))

	from src.data.dual_loader import load_dual_track_data
	from src.evaluation.golden_baseline_report import write_golden_baseline_report
	from src.evaluation.threshold_tuning import predict_with_threshold
	from src.models.hybrid_ensemble import (
	evaluate_ensemble,
	save_ensemble_meta,
	soft_vote_probs,
	tune_ensemble_threshold,
	)
	from src.models.metadata_lr import MetadataLRModel
	from src.models.transformer_trainer import (
	evaluate_pretrained_bert_baseline,
	train_transformer_stable,
	)
	from src.pipeline.run_hybrid_clean_pipeline import (
	_branch_metrics,
	_build_hf_dataset,
	_json_safe,
	_meta_frame,
	)
	from src.pipeline.run_performance_push_pipeline import infer_train_probs
	from src.utils.logger import get_logger

	# Module-level logger obtained from the project's logging helper.
	logger = get_logger(__name__)
	# Config file used when run_golden_baseline_pipeline() receives no config_path.
	DEFAULT_CONFIG = PROJECT_ROOT / "configs" / "golden_baseline_training.yaml"


	def run_golden_baseline_pipeline(*, config_path: Path | None = None) -> dict:
	    """Run the Golden Baseline strategy end to end and return its metrics.

	    The run proceeds in three steps:

	    1. Score the pretrained transformer through
	       ``evaluate_pretrained_bert_baseline`` (the "golden baseline").
	    2. Run ``train_transformer_stable`` (the "performance squeeze") and
	       collect its validation/test/train probabilities.
	    3. Fit ``MetadataLRModel`` on the cleaned text plus metadata features and
	       soft-vote it with the transformer probabilities, optionally tuning
	       the decision threshold on the validation split.

	    Side effects: saves the LR model and the ensemble metadata, and writes a
	    JSON metrics dump and a Markdown report into the configured reports
	    directory (created if missing).

	    Args:
	        config_path: YAML configuration to load. Falls back to
	            ``DEFAULT_CONFIG`` when omitted.

	    Returns:
	        The nested metrics dictionary that is also written to the JSON report.

	    Raises:
	        ValueError: If the configuration file does not contain a YAML mapping.
	        KeyError: If a required configuration key is missing.
	    """
	    cfg_path = config_path or DEFAULT_CONFIG
	    # Use a context manager so the handle is closed deterministically, and
	    # read as UTF-8 instead of whatever the locale default happens to be.
	    with open(cfg_path, encoding="utf-8") as cfg_file:
	        cfg = yaml.safe_load(cfg_file)
	    # safe_load returns None for an empty file; fail with a clear message
	    # rather than an opaque TypeError on the first subscript below.
	    if not isinstance(cfg, dict):
	        raise ValueError(f"Config {cfg_path} must contain a YAML mapping")

	    pipe_cfg = cfg["pipeline"]
	    data_cfg = cfg["data"]

	    rand = int(pipe_cfg["random_state"])
	    test_size = float(pipe_cfg["test_size"])
	    val_size = float(pipe_cfg["val_size"])
	    max_gap = float(pipe_cfg.get("max_train_test_gap", 0.05))
	    baseline_gap_target = float(pipe_cfg.get("baseline_gap_target", 0.01))
	    squeeze_gap_target = float(pipe_cfg.get("squeeze_gap_target", 0.049))
	    target_f1 = float(pipe_cfg.get("target_f1_weighted", 0.80))
	    target_col = data_cfg["target_binary"]
	    text_col = data_cfg.get("text_column", "Text")

	    out_cfg = cfg["output"]
	    reports_dir = PROJECT_ROOT / out_cfg["reports_dir"]
	    bert_dir = PROJECT_ROOT / out_cfg["transformer_dir"]
	    lr_path = PROJECT_ROOT / out_cfg["lr_path"]
	    meta_path = PROJECT_ROOT / out_cfg["ensemble_meta_path"]
	    reports_dir.mkdir(parents=True, exist_ok=True)

	    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
	    logger.info("=" * 60)
	    logger.info(f"GOLDEN BASELINE STRATEGY — run={run_id}")
	    logger.info("=" * 60)

	    df = load_dual_track_data(
	        PROJECT_ROOT / data_cfg["raw_path"],
	        processed_preprocessed=data_cfg["processed_preprocessed"],
	        processed_stats=data_cfg["processed_stats"],
	        target=target_col,
	        text_column=text_col,
	        id_column=data_cfg.get("id_column", "CommentId"),
	        features_config=data_cfg["features_config"],
	        project_root=PROJECT_ROOT,
	        write_preprocessed_if_missing=False,
	    )

	    # Stratified split: first carve out the test set, then split the
	    # remainder into train/validation, reusing the same seed for both cuts.
	    y = df[target_col].astype(int)
	    idx_trainval, idx_test = train_test_split(
	        df.index, test_size=test_size, random_state=rand, stratify=y
	    )
	    y_trainval = y.loc[idx_trainval]
	    idx_train, idx_val = train_test_split(
	        idx_trainval, test_size=val_size, random_state=rand, stratify=y_trainval
	    )

	    def _slice(index):
	        """Bundle raw text, clean text, metadata features and labels for *index*."""
	        return {
	            "raw": df.loc[index, text_col],
	            "clean": df.loc[index, "clean_text"],
	            "meta": _meta_frame(df.loc[index]),
	            "y": y.loc[index],
	        }

	    tr, va, te = _slice(idx_train), _slice(idx_val), _slice(idx_test)
	    y_train_orig = y.loc[idx_train]
	    y_test_arr = te["y"].astype(int).values
	    y_val_arr = va["y"].astype(int).values

	    # The transformer branch consumes the raw text column; the LR branch
	    # further down consumes the "clean_text" column.
	    hf_train = _build_hf_dataset(tr["raw"], tr["y"])
	    hf_val = _build_hf_dataset(va["raw"], va["y"])
	    hf_test = _build_hf_dataset(te["raw"], te["y"])

	    all_metrics: dict = {
	        "run_id": run_id,
	        "pipeline": pipe_cfg.get("name", "golden_baseline"),
	        "config": str(cfg_path),
	        "target_f1_weighted": target_f1,
	        "max_train_test_gap_pp": max_gap * 100,
	        "baseline_gap_target_pp": baseline_gap_target * 100,
	        "squeeze_gap_target_pp": squeeze_gap_target * 100,
	        "augmentation": {"enabled": False},
	    }

	    # ── Step 1: Golden Baseline (frozen pretrained, no training) ─────────────
	    logger.info("Step 1 — Golden Baseline (all layers frozen, zero fine-tuning)")
	    base_label = cfg.get("baseline", {}).get("model_label", "Golden-Baseline-Toxic-BERT")
	    baseline_result = evaluate_pretrained_bert_baseline(
	        hf_train,
	        hf_val,
	        hf_test,
	        y_train_orig.values,
	        y_test_arr,
	        y_val_arr,
	        cfg,
	        seed=rand,
	        model_label=base_label,
	    )
	    base_m = baseline_result["metrics"]
	    # NOTE: the key spelling "esencial_gap_ok" is kept as-is because
	    # _print_summary reads it (and the report writer may as well).
	    # A missing gap defaults to 1, which fails the check.
	    base_m["esencial_gap_ok"] = base_m.get("train_test_gap", 1) < baseline_gap_target
	    base_m["gap_ok"] = base_m["esencial_gap_ok"]
	    all_metrics["golden_baseline"] = base_m
	    logger.info(
	        f" Baseline F1w={base_m['f1_weighted']} gap_pp={base_m['train_test_gap_pp']} "
	        f"{'✅' if base_m['esencial_gap_ok'] else '⚠️'}"
	    )

	    # ── Step 2: Performance Squeeze (last 2 layers + R-Drop) ───────────────
	    tr_cfg = cfg.get("transformer", {})
	    squeeze_label = tr_cfg.get("model_label", "Performance-Squeeze-Toxic-BERT")
	    logger.info(
	        f"Step 2 — Performance Squeeze (last {tr_cfg.get('train_last_n_layers', 2)} layers, "
	        f"R-Drop, lr={tr_cfg.get('learning_rate', 5e-6)}, max_epochs={tr_cfg.get('max_epochs', 15)})"
	    )
	    squeeze_result = train_transformer_stable(
	        hf_train,
	        hf_val,
	        hf_test,
	        y_test_arr,
	        y_val_arr,
	        cfg,
	        bert_dir,
	        seed=rand,
	        model_label=squeeze_label,
	    )
	    squeeze_m = squeeze_result["metrics"]
	    squeeze_m["squeeze_gap_ok"] = squeeze_m.get("train_test_gap", 1) <= squeeze_gap_target
	    squeeze_m["target_f1_hit"] = squeeze_m.get("f1_weighted", 0) >= target_f1
	    all_metrics["performance_squeeze"] = squeeze_m

	    bert_val_probs = squeeze_result["val_probs"]
	    bert_test_probs = squeeze_result["test_probs"]
	    bert_train_probs = infer_train_probs(squeeze_result, tr["raw"])

	    # ── Step 3: Hybrid Safety Net (C=0.001, 200 features) ────────────────────
	    lr_cfg = cfg["logistic_regression"]
	    tfidf_cfg = lr_cfg.get("tfidf", {})
	    logger.info("Step 3 — Hybrid Safety Net (LR C=0.001, max_features=200)")
	    lr_model = MetadataLRModel(lr_cfg, tfidf_cfg)
	    lr_model.fit(tr["clean"], tr["meta"], tr["y"])
	    lr_model.save(lr_path)

	    # Column 1 of predict_proba is used as the positive-class probability.
	    lr_val_probs = lr_model.predict_proba(va["clean"], va["meta"])[:, 1]
	    lr_test_probs = lr_model.predict_proba(te["clean"], te["meta"])[:, 1]
	    lr_train_probs = lr_model.predict_proba(tr["clean"], tr["meta"])[:, 1]

	    lr_metrics = _branch_metrics(
	        tr["y"],
	        te["y"],
	        va["y"],
	        lr_train_probs,
	        lr_val_probs,
	        lr_test_probs,
	        model_name="LR-Safety-Net",
	    )
	    all_metrics["logistic_regression"] = lr_metrics

	    ens_cfg = cfg["ensemble"]
	    bw = float(ens_cfg.get("bert_weight", 0.90))
	    lw = float(ens_cfg.get("lr_weight", 0.10))
	    th_cfg = ens_cfg.get("threshold_tuning", {})
	    if th_cfg.get("enabled", True):
	        # Tune on the validation split only, so the test split stays untouched.
	        ens_threshold, val_f1_at_t = tune_ensemble_threshold(
	            bert_val_probs,
	            lr_val_probs,
	            y_val_arr,
	            bert_weight=bw,
	            lr_weight=lw,
	            metric=th_cfg.get("metric", "f1_weighted"),
	            min_threshold=float(th_cfg.get("min_threshold", 0.30)),
	            max_threshold=float(th_cfg.get("max_threshold", 0.70)),
	            step=float(th_cfg.get("step", 0.01)),
	        )
	    else:
	        ens_threshold = 0.5
	        val_f1_at_t = None

	    hybrid_metrics = evaluate_ensemble(
	        bert_test_probs,
	        lr_test_probs,
	        y_test_arr,
	        bert_weight=bw,
	        lr_weight=lw,
	        model_name="Hybrid-Safety-Net",
	        threshold=ens_threshold,
	    )
	    # Score the ensemble on the training split too, to measure the
	    # train/test F1 gap at the same threshold.
	    ens_train_probs = soft_vote_probs(bert_train_probs, lr_train_probs, bw, lw)
	    ens_train_preds = predict_with_threshold(ens_train_probs, ens_threshold)
	    y_tr_arr = tr["y"].astype(int).values
	    f1_train_ens = float(f1_score(y_tr_arr, ens_train_preds, average="weighted", zero_division=0))
	    gap_ens = abs(f1_train_ens - hybrid_metrics["f1_weighted"])
	    hybrid_metrics["f1_train"] = round(f1_train_ens, 4)
	    hybrid_metrics["train_test_gap"] = round(gap_ens, 4)
	    hybrid_metrics["train_test_gap_pp"] = round(gap_ens * 100, 2)
	    hybrid_metrics["gap_ok"] = gap_ens <= max_gap
	    hybrid_metrics["bert_weight"] = bw
	    hybrid_metrics["lr_weight"] = lw
	    # Compare against None explicitly: a validation F1 of exactly 0.0 is a
	    # real (if poor) result and must not be reported as "not tuned".
	    hybrid_metrics["val_f1_weighted_at_threshold"] = (
	        round(val_f1_at_t, 4) if val_f1_at_t is not None else None
	    )
	    hybrid_metrics["target_f1_hit"] = hybrid_metrics["f1_weighted"] >= target_f1
	    hybrid_metrics["briefing_compliant"] = hybrid_metrics["gap_ok"] and hybrid_metrics["target_f1_hit"]
	    all_metrics["hybrid_safety_net"] = hybrid_metrics

	    save_ensemble_meta(
	        meta_path,
	        {
	            "run_id": run_id,
	            "bert_dir": str(bert_dir),
	            "lr_path": str(lr_path),
	            "weights": {"bert": bw, "lr": lw, "fixed": True},
	            "thresholds": {
	                "bert": squeeze_m.get("threshold"),
	                "lr": lr_metrics.get("threshold"),
	                "ensemble": ens_threshold,
	            },
	        },
	    )

	    report_path = reports_dir / f"golden_baseline_run_{run_id}.json"
	    with open(report_path, "w", encoding="utf-8") as f:
	        json.dump(_json_safe(all_metrics), f, indent=2)

	    md_path = reports_dir / f"integrated_report_{run_id}.md"
	    write_golden_baseline_report(all_metrics, md_path)
	    logger.info(f"Report: {md_path}")

	    _print_summary(all_metrics, target_f1, max_gap * 100, baseline_gap_target * 100)
	    return all_metrics


	def _print_summary(
	    metrics: dict, target: float, max_gap_pp: float, baseline_gap_pp: float
	) -> None:
	    """Log a two-line recap (baseline, then hybrid) framed by separator rules.

	    Only ``metrics`` is read: the ``golden_baseline`` and
	    ``hybrid_safety_net`` entries supply the F1, the gap in percentage points
	    and the pass/fail flag for each line. ``target``, ``max_gap_pp`` and
	    ``baseline_gap_pp`` are accepted but do not appear in the messages.
	    """
	    rule = "=" * 60
	    logger.info(rule)

	    baseline_stats = metrics.get("golden_baseline", {})
	    hybrid_stats = metrics.get("hybrid_safety_net", {})

	    # Baseline line: badge depends on the stored gap flag.
	    baseline_f1 = baseline_stats.get("f1_weighted")
	    baseline_gap = baseline_stats.get("train_test_gap_pp")
	    if baseline_stats.get("esencial_gap_ok"):
	        baseline_badge = "✅ <1%"
	    else:
	        baseline_badge = "⚠️"
	    logger.info(f"BASELINE F1w={baseline_f1} gap_pp={baseline_gap} ({baseline_badge})")

	    # Hybrid line: badge depends on the combined F1 + gap compliance flag.
	    hybrid_f1 = hybrid_stats.get("f1_weighted")
	    hybrid_gap = hybrid_stats.get("train_test_gap_pp")
	    if hybrid_stats.get("briefing_compliant"):
	        hybrid_badge = "✅ F1+gap"
	    else:
	        hybrid_badge = "⚠️ below target"
	    logger.info(f"HYBRID F1w={hybrid_f1} gap_pp={hybrid_gap} ({hybrid_badge})")

	    logger.info(rule)


	def main():
	    """CLI entry point: parse the optional ``--config`` flag and run the pipeline."""
	    cli = argparse.ArgumentParser(description="Golden Baseline strategy pipeline")
	    cli.add_argument("--config", type=str, default=None)
	    parsed = cli.parse_args()

	    # An absent or empty --config means "use the pipeline's default config".
	    if parsed.config:
	        chosen_config = Path(parsed.config)
	    else:
	        chosen_config = None
	    run_golden_baseline_pipeline(config_path=chosen_config)


	# Execute only when run as a script / via `python -m`, not on import.
	if __name__ == "__main__":
	    main()