CausalGrok / code /experiments /mechinterp_m6_neuron_ablation.py

Upload code/experiments

50fa85c verified 17 days ago

17.4 kB

	"""M6 — Neuron-level Ablation (the textbook reviewer-asked intervention).

	Pipeline:
	1. At a chosen checkpoint (default: peak_ood_epoch), extract avgpool
	features for train (H0-H2) and OOD (H4) splits.
	2. Score each of the 512 avgpool channels by *how predictive its activation
	is of hospital ID*: we use a one-vs-rest logistic-regression coefficient
	per channel × class as the per-neuron shortcut score:
	score_c = max_h \|β_{h,c}\| (β = coefficients of LR fit per channel)
	↑ score_c → channel c is more strongly stain-shortcut-aligned.
	3. Sweep top-K ∈ {0, 8, 16, 32, 64, 128} ablated neurons (zero out their
	activations) and measure:
	- head OOD acc (raw vs ablated)
	- hospital-probe acc on raw vs ablated features
	- tumor-probe acc on raw vs ablated features
	4. Strong mechanistic claim:
	- hospital-probe acc collapses sharply with K (these neurons are
	carrying hospital info)
	- head OOD acc improves (or at least preserves) at small K (the
	model was using shortcut neurons to harm OOD)
	- tumor-probe acc stays flat (causal info is distributed elsewhere)

	Usage
	-----
	python -m experiments.mechinterp_m6_neuron_ablation \\
	--run_dir experiments/runs/<id> \\
	--data_root data/wilds \\
	[--epoch 50] [--max_samples 1000] \\
	[--ks "0,4,8,16,32,64,128,256"]
	"""
	from __future__ import annotations

	import argparse
	import json
	from pathlib import Path
	from typing import Dict, List

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np
	import torch
	from sklearn.linear_model import LogisticRegression
	from sklearn.preprocessing import StandardScaler

	from torch.utils.data import DataLoader, Subset
	from torchvision import transforms

	from experiments.mechinterp_m1 import (
	register_hooks, extract_features, load_model_from_checkpoint,
	)
	from experiments.mechinterp_m4_ablation import (
	_select_epoch, _TransformWrapper,
	_classifier_logits_from_features, _accuracy,
	)
	from utils.camelyon_data import get_camelyon_subsets


	def _build_loaders_with_id(data_root: str, max_samples: int, seed: int = 42):
	"""Like M4's _build_loaders but also returns an ID validation loader so
	we can track ID acc and compute the OOD/ID degradation ratio."""
	transform = transforms.Compose([
	transforms.Resize((96, 96)),
	transforms.ToTensor(),
	transforms.Normalize(mean=[0.485, 0.456, 0.406],
	std=[0.229, 0.224, 0.225]),
	])
	train_ds, id_val_ds, ood_test_ds, _ = get_camelyon_subsets(
	root_dir=data_root, download=False
	)
	train_t = _TransformWrapper(train_ds, transform)
	id_t = _TransformWrapper(id_val_ds, transform)
	ood_t = _TransformWrapper(ood_test_ds, transform)

	torch.manual_seed(seed)
	train_idx = torch.randperm(len(train_t))[:max_samples]
	id_idx = torch.randperm(len(id_t))[:max_samples // 2]
	ood_idx = torch.randperm(len(ood_t))[:max_samples // 2]

	train_loader = DataLoader(Subset(train_t, train_idx), batch_size=128,
	shuffle=False, num_workers=0)
	id_loader = DataLoader(Subset(id_t, id_idx), batch_size=128,
	shuffle=False, num_workers=0)
	ood_loader = DataLoader(Subset(ood_t, ood_idx), batch_size=128,
	shuffle=False, num_workers=0)
	return train_loader, id_loader, ood_loader


	def _per_neuron_shortcut_scores(X_n: np.ndarray, hosp: np.ndarray) -> np.ndarray:
	"""Return a (D,) array — score per channel c, larger = more hospital-predictive.

	Uses a 1-feature-at-a-time log-reg fit's \|coef\| would be dominated by feature
	scale; instead we fit a single multiclass LR over all features and use the
	L2 norm of (β_{:,c}) — the column norm of the LR coefficient matrix —
	as channel c's hospital-discrimination score.
	"""
	clf = LogisticRegression(max_iter=500, C=1.0, solver="lbfgs",
	multi_class="auto", n_jobs=-1).fit(X_n, hosp)
	W = clf.coef_ # (n_classes, D)
	# column norms — large means many class-discriminations rely on this neuron
	return np.linalg.norm(W, axis=0) # (D,)


	def _ablate_and_eval(
	X_n, mask, scaler, head_target, model, layer, device,
	hosp_clf, tumor_clf, hosp_target, tumor_target,
	):
	"""Apply mask to normalized features, unscale, evaluate everything."""
	X_ablated_n = X_n * mask[None, :]
	X_ablated = scaler.inverse_transform(X_ablated_n)
	logits = _classifier_logits_from_features(model, X_ablated, layer, device)
	head_acc = _accuracy(logits, head_target)
	hosp_acc = hosp_clf.score(X_ablated_n, hosp_target) if hosp_clf is not None and len(np.unique(hosp_target)) > 1 else float("nan")
	tumor_acc = tumor_clf.score(X_ablated_n, tumor_target)
	return head_acc, hosp_acc, tumor_acc


	def run_neuron_ablation(
	run_dir: Path,
	data_root: str,
	epoch: int \| None = None,
	max_samples: int = 1000,
	device: str = "cuda",
	ks: List[int] = None,
	n_random_samples: int = 5,
	include_morphology: bool = True,
	include_id: bool = True,
	) -> Dict:
	if ks is None:
	# Dose-response curve emphasizing small K (per reviewer guidance)
	ks = [0, 4, 8, 16, 32, 64, 128, 256]

	epoch, ckpt_path = _select_epoch(run_dir, epoch)

	print(f"\n M6 — Neuron Ablation (with random + morphology controls)")
	print(f" run_dir : {run_dir.name}")
	print(f" epoch : {epoch} ({ckpt_path.name})")
	print(f" ks : {ks}")
	print(f" random ablation: {n_random_samples} samplings per K")

	model = load_model_from_checkpoint(str(ckpt_path), n_classes=2, device=device)
	model.eval()
	register_hooks(model)

	cfg_path = run_dir / "config.json"
	seed = 42
	if cfg_path.exists():
	seed = json.loads(cfg_path.read_text()).get("seed", 42)

	if include_id:
	train_loader, id_loader, ood_loader = _build_loaders_with_id(data_root, max_samples, seed=seed)
	else:
	from experiments.mechinterp_m4_ablation import _build_loaders as _bl
	train_loader, ood_loader = _bl(data_root, max_samples, seed=seed)
	id_loader = None

	print(f" Extracting features (train + id + ood)...")
	feats_train, hosp_train, tumor_train = extract_features(
	model, train_loader, device, max_samples=max_samples
	)
	feats_ood, hosp_ood, tumor_ood = extract_features(
	model, ood_loader, device, max_samples=max_samples // 2
	)
	feats_id, hosp_id, tumor_id = (None, None, None)
	if id_loader is not None:
	feats_id, hosp_id, tumor_id = extract_features(
	model, id_loader, device, max_samples=max_samples // 2
	)

	layer = "avgpool"
	def _to_2d(arr):
	a = np.asarray(arr); return a.reshape(a.shape[0], -1)
	X_tr = _to_2d(feats_train[layer])
	X_ood = _to_2d(feats_ood[layer])
	X_id = _to_2d(feats_id[layer]) if feats_id is not None else None
	hosp_train = np.asarray(hosp_train); hosp_ood = np.asarray(hosp_ood)
	tumor_train = np.asarray(tumor_train); tumor_ood = np.asarray(tumor_ood)
	if X_id is not None:
	hosp_id = np.asarray(hosp_id); tumor_id = np.asarray(tumor_id)

	scaler = StandardScaler().fit(X_tr)
	X_tr_n = scaler.transform(X_tr)
	X_ood_n = scaler.transform(X_ood)
	X_id_n = scaler.transform(X_id) if X_id is not None else None

	# 1. Per-neuron scores: shortcut (hospital) and morphology (tumor)
	print(f" Scoring {X_tr.shape[1]} avgpool channels...")
	shortcut_scores = _per_neuron_shortcut_scores(X_tr_n, hosp_train)
	morphology_scores = _per_neuron_shortcut_scores(X_tr_n, tumor_train) if include_morphology else None
	rank_shortcut = np.argsort(-shortcut_scores)
	rank_morphology = np.argsort(-morphology_scores) if morphology_scores is not None else None

	hosp_clf = LogisticRegression(max_iter=500, C=1.0, solver="lbfgs",
	multi_class="auto", n_jobs=-1).fit(X_tr_n, hosp_train)
	tumor_clf = LogisticRegression(max_iter=500, C=1.0, solver="lbfgs",
	multi_class="auto", n_jobs=-1).fit(X_tr_n, tumor_train)

	rng = np.random.default_rng(seed)
	D = X_tr.shape[1]

	sweep = []
	for k in ks:
	row = {"k": int(k)}

	# Mask helpers
	def make_mask(indices):
	m = np.ones(D)
	if k > 0:
	m[indices[:k]] = 0.0
	return m

	# ── A: top-K SHORTCUT neurons (the targeted ablation) ──
	mask_s = make_mask(rank_shortcut)
	h_ood, hp_ood, tp_ood = _ablate_and_eval(
	X_ood_n, mask_s, scaler, tumor_ood, model, layer, device,
	hosp_clf, tumor_clf, hosp_ood, tumor_ood,
	)
	row["shortcut_head_ood"] = float(h_ood)
	row["shortcut_hosp_probe"] = float(hp_ood)
	row["shortcut_tumor_probe"] = float(tp_ood)
	if X_id_n is not None:
	h_id, _, _ = _ablate_and_eval(
	X_id_n, mask_s, scaler, tumor_id, model, layer, device,
	None, tumor_clf, hosp_id, tumor_id,
	)
	row["shortcut_head_id"] = float(h_id)

	# ── B: top-K MORPHOLOGY neurons (control: ablate the causal neurons) ──
	if include_morphology and rank_morphology is not None:
	mask_m = make_mask(rank_morphology)
	h_ood_m, _, _ = _ablate_and_eval(
	X_ood_n, mask_m, scaler, tumor_ood, model, layer, device,
	None, tumor_clf, hosp_ood, tumor_ood,
	)
	row["morphology_head_ood"] = float(h_ood_m)
	if X_id_n is not None:
	h_id_m, _, _ = _ablate_and_eval(
	X_id_n, mask_m, scaler, tumor_id, model, layer, device,
	None, tumor_clf, hosp_id, tumor_id,
	)
	row["morphology_head_id"] = float(h_id_m)

	# ── C: K RANDOM neurons (control: damage uniformly) ──
	if k > 0:
	r_oods, r_ids = [], []
	for s_ in range(n_random_samples):
	idx = rng.permutation(D)[:k]
	m = np.ones(D); m[idx] = 0.0
	h_ood_r, _, _ = _ablate_and_eval(
	X_ood_n, m, scaler, tumor_ood, model, layer, device,
	None, tumor_clf, hosp_ood, tumor_ood,
	)
	r_oods.append(h_ood_r)
	if X_id_n is not None:
	h_id_r, _, _ = _ablate_and_eval(
	X_id_n, m, scaler, tumor_id, model, layer, device,
	None, tumor_clf, hosp_id, tumor_id,
	)
	r_ids.append(h_id_r)
	row["random_head_ood_mean"] = float(np.mean(r_oods))
	row["random_head_ood_std"] = float(np.std(r_oods))
	if r_ids:
	row["random_head_id_mean"] = float(np.mean(r_ids))
	row["random_head_id_std"] = float(np.std(r_ids))
	else:
	row["random_head_ood_mean"] = row["shortcut_head_ood"] # K=0 same as baseline
	row["random_head_ood_std"] = 0.0
	if X_id_n is not None:
	row["random_head_id_mean"] = row.get("shortcut_head_id", float("nan"))
	row["random_head_id_std"] = 0.0

	sweep.append(row)
	# Concise log line
	print(f" K={k:>4} shortcut={row['shortcut_head_ood']:.3f} "
	f"random={row.get('random_head_ood_mean', float('nan')):.3f}±"
	f"{row.get('random_head_ood_std', 0):.3f} "
	+ (f"morphology={row.get('morphology_head_ood', float('nan')):.3f}"
	if include_morphology else ""))

	return {
	"run_id": run_dir.name,
	"epoch": epoch,
	"layer": layer,
	"max_samples": max_samples,
	"feature_dim": int(X_tr.shape[1]),
	"shortcut_scores_top10": [int(i) for i in rank_shortcut[:10]],
	"morphology_scores_top10": ([int(i) for i in rank_morphology[:10]]
	if rank_morphology is not None else []),
	"n_random_samples": n_random_samples,
	"include_id": include_id,
	"include_morphology": include_morphology,
	"sweep": sweep,
	}


	def plot_neuron_ablation(result: Dict, out_path: Path):
	sweep = result["sweep"]
	ks = [r["k"] for r in sweep]

	has_id = result.get("include_id", False)
	has_morph = result.get("include_morphology", False)

	fig, axes = plt.subplots(1, 2 if has_id else 1, figsize=(13, 5)) if has_id else \
	plt.subplots(1, 1, figsize=(8, 5))
	if not has_id:
	axes = [axes]

	# Panel A — Head OOD: shortcut vs random (vs morphology)
	ax = axes[0]
	shortcut_ood = [r.get("shortcut_head_ood") for r in sweep]
	random_ood_mu = [r.get("random_head_ood_mean") for r in sweep]
	random_ood_sd = [r.get("random_head_ood_std", 0) for r in sweep]
	morphology_ood = [r.get("morphology_head_ood") for r in sweep] if has_morph else None

	ax.plot(ks, shortcut_ood, "r-o", lw=2.2, ms=7, label="top-K shortcut neurons (targeted)")
	ax.plot(ks, random_ood_mu, "k-s", lw=1.8, ms=6, label="K random neurons (control)")
	ax.fill_between(ks,
	[m - s for m, s in zip(random_ood_mu, random_ood_sd)],
	[m + s for m, s in zip(random_ood_mu, random_ood_sd)],
	color="black", alpha=0.15)
	if has_morph and morphology_ood is not None:
	ax.plot(ks, morphology_ood, "g-^", lw=1.8, ms=6,
	label="top-K morphology neurons (control)")

	base = shortcut_ood[0]
	ax.axhline(base, color="gray", ls=":", lw=1, alpha=0.5,
	label=f"K=0 baseline ({base:.3f})")
	ax.set_xlabel("K (neurons zeroed at avgpool)")
	ax.set_ylabel("Head OOD (H4) accuracy")
	ax.set_xscale("symlog", linthresh=4)
	ax.set_title("Targeted vs random ablation — OOD effect\n"
	"(separation = shortcut neurons selectively hurt OOD)",
	fontweight="bold", fontsize=10)
	ax.legend(loc="best", fontsize=8); ax.grid(alpha=0.3)

	# Panel B — ID/OOD tradeoff
	if has_id:
	ax = axes[1]
	shortcut_id = [r.get("shortcut_head_id") for r in sweep]
	random_id_mu = [r.get("random_head_id_mean") for r in sweep]
	random_id_sd = [r.get("random_head_id_std", 0) for r in sweep]
	ax.plot(ks, shortcut_id, "r--o", lw=2, ms=7, alpha=0.85, label="ID (shortcut ablation)")
	ax.plot(ks, shortcut_ood, "r-o", lw=2, ms=7, label="OOD (shortcut ablation)")
	ax.plot(ks, random_id_mu, "k--s", lw=1.6, ms=5, alpha=0.7, label="ID (random ablation)")
	ax.plot(ks, random_ood_mu, "k-s", lw=1.6, ms=5, alpha=0.7, label="OOD (random ablation)")
	ax.set_xlabel("K (neurons zeroed at avgpool)")
	ax.set_ylabel("Head accuracy")
	ax.set_xscale("symlog", linthresh=4)
	ax.set_title("ID vs OOD degradation tradeoff\n"
	"(targeted: OOD steady or ↑ while ID slowly ↓ = good)",
	fontweight="bold", fontsize=10)
	ax.legend(fontsize=8, loc="best"); ax.grid(alpha=0.3)

	fig.suptitle(f"M6 — Targeted Neuron Ablation vs Random Control: {result['run_id']} "
	f"• ep{result['epoch']}",
	fontsize=11, fontweight="bold")
	plt.tight_layout()
	fig.savefig(out_path, dpi=180, bbox_inches="tight")
	plt.close(fig)


	def main():
	p = argparse.ArgumentParser()
	p.add_argument("--run_dir", required=True)
	p.add_argument("--data_root", default="data/wilds")
	p.add_argument("--epoch", type=int, default=None)
	p.add_argument("--max_samples", type=int, default=1000)
	p.add_argument("--device", default="cuda")
	p.add_argument("--ks", default=None,
	help="Comma-separated K values, e.g. '0,4,8,16,32,64,128,256'")
	p.add_argument("--n_random_samples", type=int, default=5,
	help="Random ablation: averages over this many random K-subsets")
	p.add_argument("--no_morphology", action="store_true",
	help="Skip the morphology-targeted ablation control")
	p.add_argument("--no_id", action="store_true",
	help="Skip ID accuracy evaluation (faster but loses ID/OOD ratio)")
	args = p.parse_args()

	ks = None
	if args.ks is not None:
	ks = [int(x) for x in args.ks.split(",")]

	run_dir = Path(args.run_dir)
	out_dir = run_dir / "mechinterp"
	out_dir.mkdir(parents=True, exist_ok=True)

	result = run_neuron_ablation(
	run_dir=run_dir, data_root=args.data_root, epoch=args.epoch,
	max_samples=args.max_samples, device=args.device, ks=ks,
	n_random_samples=args.n_random_samples,
	include_morphology=not args.no_morphology,
	include_id=not args.no_id,
	)
	base = out_dir / f"m6_neuron_ablation_ep{result['epoch']:05d}"
	base.with_suffix(".json").write_text(json.dumps(result, indent=2))
	plot_neuron_ablation(result, base.with_suffix(".png"))
	print(f"\n → {base.with_suffix('.json')}")
	print(f" → {base.with_suffix('.png')}")


	if __name__ == "__main__":
	main()