pivot: code + trained checkpoints (norman, replogle k562)

3b4941f verified 1 day ago

9.73 kB

	"""figures for the paper, helvetica-ish font (nimbus sans).
	figure 1: effect-axis embedding panels (control / observed / pivot-predicted).
	figure 2: quantitative results (forward bars, gears head-to-head, dist-loss, reward)."""
	import sys, os, glob, json
	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
	import numpy as np
	import matplotlib
	matplotlib.use("Agg")
	from matplotlib import font_manager as fm
	import matplotlib.pyplot as plt
	from matplotlib.patches import Patch
	from matplotlib.lines import Line2D

	# Register a Helvetica-like family with matplotlib: Nimbus Sans (the URW
	# Helvetica clone) first, then Liberation Sans. A font file that cannot be
	# added is skipped so the remaining fallbacks in "font.sans-serif" still apply.
	for font_glob in (
	    "/usr/share/fonts/opentype/urw-base35/NimbusSans-*.otf",
	    "/usr/share/fonts/truetype/liberation2/LiberationSans-*.ttf",
	):
	    for font_path in glob.glob(font_glob):
	        try:
	            fm.fontManager.addfont(font_path)
	        except Exception:
	            pass

	# Global plot style shared by every figure produced by this script.
	plt.rcParams.update({
	    # typography
	    "font.family": "sans-serif",
	    "font.sans-serif": ["Nimbus Sans", "Helvetica", "Arial", "Liberation Sans", "DejaVu Sans"],
	    "mathtext.fontset": "dejavusans",
	    "font.size": 11,
	    "axes.labelsize": 12,
	    "axes.titlesize": 12.5,
	    "xtick.labelsize": 10,
	    "ytick.labelsize": 10,
	    "legend.fontsize": 10,
	    # axes, ticks and text colours
	    "axes.linewidth": 0.9,
	    "axes.edgecolor": "#444444",
	    "xtick.color": "#444444",
	    "ytick.color": "#444444",
	    "axes.labelcolor": "#222222",
	    "text.color": "#222222",
	    # resolution and export
	    "figure.dpi": 150,
	    "savefig.dpi": 320,
	    "savefig.bbox": "tight",
	    # open frame (no top/right spine) with a light grid drawn below the data
	    "axes.spines.top": False,
	    "axes.spines.right": False,
	    "axes.grid": True,
	    "grid.color": "#E6E8EB",
	    "grid.linewidth": 0.8,
	    "axes.axisbelow": True,
	    "legend.frameon": False,
	})
	# palette: one hex colour per plotted role
	C_CTRL = "#B9C2CC"   # control cells
	C_OBS = "#2D6FB3"    # observed perturbed cells / DE-correlation curve
	C_PRED = "#E4572E"   # model-predicted cells / MMD curve
	C_PIVOT = "#1B9E77"  # highlighted (PIVOT) bars
	C_BASE = "#9AA6B2"   # baseline bars
	C_ACC = "#6A4C93"    # accent (GEARS bar)

	# figures are written under FIG; result JSON files are read from RES
	FIG = "figures"
	RES = "experiments/results"
	os.makedirs(FIG, exist_ok=True)
	def L(n):
	    """Return the parsed contents of the result file ``{RES}/{n}.json``.

	    Args:
	        n: file stem (without the ``.json`` suffix) inside the ``RES`` directory.

	    Returns:
	        Whatever ``json.load`` produces for that file.

	    Raises:
	        OSError: if the file cannot be opened.
	        json.JSONDecodeError: if the file is not valid JSON.
	    """
	    # The context manager closes the handle deterministically instead of
	    # leaving it to garbage collection; JSON text is read as UTF-8.
	    with open(f"{RES}/{n}.json", encoding="utf-8") as fh:
	        return json.load(fh)
	def save(fig, name):
	    """Write ``fig`` to ``{FIG}/{name}.png`` and ``{FIG}/{name}.pdf``, then close it.

	    Args:
	        fig: figure object exposing ``savefig``; it is closed via ``plt.close``.
	        name: output file stem, also echoed to stdout once both files are written.
	    """
	    # PNG first, then PDF, under the same stem.
	    for ext in ("png", "pdf"):
	        fig.savefig(f"{FIG}/{name}.{ext}")
	    plt.close(fig)
	    print("wrote", name)


	def kde_contour(ax, X, color, levels=4):
	    """Overlay Gaussian-KDE density contours of the 2-D points ``X`` on ``ax``.

	    This is a best-effort decoration: inputs with fewer than 10 points are
	    skipped silently, and any failure (SciPy unavailable, KDE error, problem
	    while drawing) is reported as a ``RuntimeWarning`` instead of being raised,
	    so the surrounding figure is still produced.

	    Args:
	        ax: object exposing a matplotlib-style ``contour`` method.
	        X: array whose rows are points and whose two columns are x and y.
	        color: forwarded to ``ax.contour`` as ``colors``.
	        levels: forwarded to ``ax.contour`` as ``levels``.

	    Returns:
	        None.
	    """
	    import warnings

	    try:
	        from scipy.stats import gaussian_kde

	        if len(X) < 10:
	            return
	        density = gaussian_kde(X.T)
	        # Evaluate the density on an 80x80 grid spanning the data's bounding box.
	        xmin, ymin = X.min(0)
	        xmax, ymax = X.max(0)
	        xs, ys = np.mgrid[xmin:xmax:80j, ymin:ymax:80j]
	        grid_points = np.vstack([xs.ravel(), ys.ravel()])
	        z = density(grid_points).reshape(xs.shape)
	        ax.contour(xs, ys, z, levels=levels, colors=color, linewidths=1.0, alpha=0.55)
	    except Exception as exc:
	        # Contours are optional, so do not abort the figure -- but surface the
	        # reason instead of hiding it, so genuine plotting bugs stay visible.
	        warnings.warn(f"kde_contour skipped: {exc!r}", RuntimeWarning, stacklevel=2)


	def figure1(model, data, targets, device):
	    """Draw one effect-axis embedding panel per target perturbation and save it.

	    In every panel the x axis is the projection onto the unit vector pointing
	    from the control mean to the target mean, and the y axis is the top
	    principal component of the target's cells after that direction has been
	    removed. Each panel overlays sampled control cells, the observed perturbed
	    cells and the model-predicted cells, with an arrow from the control centroid
	    to the observed centroid. The figure is written through ``save`` under the
	    stem ``fig1_embedding_panels``.

	    Args:
	        model: forwarded to ``inf.encode_label`` and ``inf.forward_predict``.
	        data: must expose ``control_idx``, ``pert_to_idx`` and ``emb``, where
	            ``emb`` is indexable by both kinds of index. Presumably ``emb`` is a
	            NumPy array of shape (cells, dims) -- TODO confirm against the loader.
	        targets: perturbation labels; one panel is drawn per label.
	        device: torch device that receives the sampled control embeddings.
	    """
	    from src.evaluation import inference as inf
	    import torch

	    rng = np.random.default_rng(0)  # fixed seed: the sampled cells are reproducible
	    ctrl_idx = data.control_idx
	    n_ctrl = len(ctrl_idx)
	    cmean = data.emb[ctrl_idx].mean(0)
	    # Sample sizes are clamped to the number of controls so that sampling
	    # without replacement cannot fail on small datasets; with >= 500 controls
	    # the draws are the same as with the fixed sizes 400 / 500.
	    source_idx = rng.choice(ctrl_idx, min(400, n_ctrl), replace=False)
	    c0e = torch.as_tensor(data.emb[source_idx], dtype=torch.float32, device=device)

	    n = len(targets)
	    fig, axes = plt.subplots(1, n, figsize=(4.3 * n, 4.1))
	    if n == 1:
	        axes = [axes]  # plt.subplots returns a bare Axes for a single panel

	    for ax, p in zip(axes, targets):
	        ti = data.pert_to_idx[p]
	        tmean = data.emb[ti].mean(0)
	        # effect direction: unit vector from control mean to target mean
	        d = tmean - cmean
	        d = d / (np.linalg.norm(d) + 1e-9)
	        # orthogonal axis = top pc of perturbed cells with effect-dir removed
	        Y = data.emb[ti] - data.emb[ti].mean(0)
	        Y = Y - np.outer(Y @ d, d)
	        _, _, vt = np.linalg.svd(Y, full_matrices=False)
	        o = vt[0]

	        def proj(M):
	            """Map rows of M to (effect-axis, orthogonal-axis) coordinates."""
	            centred = M - cmean
	            return np.c_[centred @ d, centred @ o]

	        ci = rng.choice(ctrl_idx, min(500, n_ctrl), replace=False)
	        Pc, Po = proj(data.emb[ci]), proj(data.emb[ti])
	        e = inf.encode_label(model, data, p, device)
	        Ppred = proj(inf.forward_predict(model, c0e, e).cpu().numpy())

	        ax.scatter(Pc[:, 0], Pc[:, 1], s=9, c=C_CTRL, alpha=0.7, linewidths=0,
	                   label="control", rasterized=True)
	        ax.scatter(Po[:, 0], Po[:, 1], s=11, c=C_OBS, alpha=0.55, linewidths=0,
	                   label="observed perturbed", rasterized=True)
	        ax.scatter(Ppred[:, 0], Ppred[:, 1], s=22, marker="X", c=C_PRED, alpha=0.9,
	                   edgecolors="white", linewidths=0.4, label="PIVOT predicted")
	        kde_contour(ax, Po, C_OBS)
	        kde_contour(ax, Ppred, C_PRED)

	        # arrow control-centroid -> observed-centroid (the transport)
	        cc, oc = proj(cmean[None])[0], proj(tmean[None])[0]
	        # "-|>" is matplotlib's filled-head arrow style; it must be spelled
	        # without a backslash to be a valid style name.
	        ax.annotate("", xy=(oc[0], oc[1]), xytext=(cc[0], cc[1]),
	                    arrowprops=dict(arrowstyle="-|>", color="#333333", lw=1.6, alpha=0.8))
	        ax.set_title(p.replace("_", "+"), fontweight="bold")
	        ax.set_xlabel("effect axis")
	        ax.set_ylabel("orthogonal axis")
	        ax.tick_params(length=0)

	    # One shared legend below the panels, built from proxy artists.
	    handles = [
	        Line2D([], [], marker='o', ls='', mfc=C_CTRL, mec='none', ms=7, label='control'),
	        Line2D([], [], marker='o', ls='', mfc=C_OBS, mec='none', ms=7, label='observed perturbed'),
	        Line2D([], [], marker='X', ls='', mfc=C_PRED, mec='white', ms=8, label='PIVOT predicted'),
	    ]
	    fig.legend(handles=handles, loc="lower center", ncol=3, bbox_to_anchor=(0.5, -0.04))
	    fig.suptitle("Control cells transported toward the perturbed population", y=1.02,
	                 fontsize=13, fontweight="bold")
	    fig.tight_layout()
	    save(fig, "fig1_embedding_panels")


	def _panel_forward(a):
	    """Panel (a): horizontal bars of forward DE correlation per method, sorted ascending."""
	    bf = L("norman_benchmark")["forward"]
	    # Benchmark key -> display label; insertion order is the method order.
	    pretty = {
	        "PIVOT": "PIVOT",
	        "LinearResponse": "Linear",
	        "kNN-latent": "kNN-latent",
	        "Additive": "Additive",
	        "NearestPerturbationCentroid": "Nearest centroid",
	        "ConditionalMLP": "Conditional MLP",
	        "EndpointMLP": "Endpoint MLP",
	        "AvgPerturbationEffect": "Avg. effect",
	        "MeanControl": "Mean control",
	        "Random": "Random",
	    }
	    # Methods missing from the benchmark file are simply left out.
	    vals = [(label, bf[m]["de_corr"]) for m, label in pretty.items() if m in bf]
	    vals.sort(key=lambda kv: kv[1])
	    names = [label for label, _ in vals]
	    de = [score for _, score in vals]
	    cols = [C_PIVOT if label == "PIVOT" else C_BASE for label in names]
	    a.barh(names, de, color=cols, edgecolor="white", height=0.74)
	    for i, v in enumerate(de):
	        a.text(v + 0.01, i, f"{v:.2f}", va="center", fontsize=9, color="#333")
	    a.set_xlim(0, 1.0)
	    a.set_xlabel("DE correlation $\\uparrow$")
	    a.grid(axis="y", visible=False)
	    a.set_title("a Forward direction, held-out perturbations", loc="left", fontweight="bold")


	def _panel_gears(b):
	    """Panel (b): PIVOT vs GEARS bars for the DE-gene Pearson score."""
	    pg = L("pivot_vs_gears")
	    scores = [pg["pivot_pearson_de_expr"], pg["gears_pearson_de_expr"]]
	    bars = b.bar(["PIVOT", "GEARS"], scores,
	                 color=[C_PIVOT, C_ACC], edgecolor="white", width=0.55)
	    for r, v in zip(bars, scores):
	        b.text(r.get_x() + r.get_width()/2, v + 0.012, f"{v:.3f}",
	               ha="center", fontsize=11, fontweight="bold")
	    b.set_ylim(0, 1.08)
	    b.set_ylabel("Top-20 DE-gene Pearson $\\uparrow$")
	    b.grid(axis="x", visible=False)
	    b.set_title("b Head-to-head vs GEARS (matched perts)", loc="left", fontweight="bold")


	def _panel_distloss(c):
	    """Panel (c): MMD (left axis) and DE correlation (right axis) vs loss weight."""
	    dl = L("norman_distloss")["rows"]
	    # Sort the original string keys numerically and index with those same
	    # keys; rebuilding a key from the float fails for spellings such as
	    # "1" or "1e-2".
	    keys = sorted(dl, key=float)
	    lam = [float(k) for k in keys]
	    mmd = [dl[k]["mmd"] for k in keys]
	    de2 = [dl[k]["de_corr"] for k in keys]
	    c.plot(lam, mmd, "o-", color=C_PRED, lw=2.2, ms=7, label="MMD $\\downarrow$")
	    c.set_xlabel("distributional-loss weight $\\lambda_{\\mathrm{dist}}$")
	    c.set_ylabel("population MMD $\\downarrow$", color=C_PRED)
	    c.tick_params(axis="y", colors=C_PRED)
	    c.set_ylim(0, max(mmd)*1.15)
	    # Second y axis sharing the same x axis for the DE-correlation curve.
	    c2 = c.twinx()
	    c2.plot(lam, de2, "s--", color=C_OBS, lw=2.0, ms=6, label="DE-corr $\\uparrow$")
	    c2.set_ylabel("DE correlation $\\uparrow$", color=C_OBS)
	    c2.tick_params(axis="y", colors=C_OBS)
	    c2.set_ylim(0.5, 0.95)
	    c2.grid(False)
	    c2.spines["top"].set_visible(False)
	    c.set_title("c Distributional flow loss: MMD $6\\times$ lower, direction kept",
	                loc="left", fontweight="bold")


	def _panel_reward(d):
	    """Panel (d): nomination Top-5 per reward variant, with the cosine reward highlighted."""
	    rw = L("norman_ablation_reward")["rows"]
	    rmap = {"centroid": "Centroid", "nn_target": "NN-target", "mmd": "MMD",
	            "wasserstein": "Wasserstein", "cosine": "Cosine"}
	    rn = [rmap.get(k, k) for k in rw]  # unknown reward keys keep their raw name
	    t5 = [rw[k]["top5"] for k in rw]
	    cols2 = [C_PIVOT if rmap.get(k) == "Cosine" else C_BASE for k in rw]
	    bars = d.bar(rn, t5, color=cols2, edgecolor="white", width=0.66)
	    for r, v in zip(bars, t5):
	        d.text(r.get_x()+r.get_width()/2, v+0.006, f"{v:.2f}", ha="center", fontsize=9.5)
	    d.set_ylabel("nomination Top-5 $\\uparrow$")
	    d.set_ylim(0, max(t5)*1.25)
	    d.grid(axis="x", visible=False)
	    d.tick_params(axis="x", rotation=18)
	    d.set_title("d Direction-aware reward wins at nomination", loc="left", fontweight="bold")


	def figure2_results():
	    """Build the 2x2 quantitative-results figure and save it as ``fig2_results``.

	    Panels, each filled from a JSON result file loaded with ``L``:
	    (a) forward DE correlation per method (``norman_benchmark``),
	    (b) PIVOT vs GEARS (``pivot_vs_gears``),
	    (c) MMD and DE correlation vs distributional-loss weight (``norman_distloss``),
	    (d) nomination Top-5 per reward (``norman_ablation_reward``).

	    Raises:
	        OSError / KeyError: propagated if a result file or an expected key is missing.
	    """
	    fig, ax = plt.subplots(2, 2, figsize=(11, 7.4))
	    _panel_forward(ax[0, 0])
	    _panel_gears(ax[0, 1])
	    _panel_distloss(ax[1, 0])
	    _panel_reward(ax[1, 1])
	    fig.tight_layout()
	    save(fig, "fig2_results")


	if __name__ == "__main__":
	    # Script entry point: train on the "norman" dataset, then render both figures.
	    from src.data.perturb_data import load_dataset
	    from src.training.train import TrainConfig, train

	    data = load_dataset("norman")
	    # GPU index is read from the PIVOT_GPU environment variable (default "3").
	    gpu = int(os.environ.get("PIVOT_GPU", "3"))
	    cfg = TrainConfig(dataset="norman", split="cell", epochs=60, device_index=gpu)
	    model, _train_info = train(cfg, data=data, verbose=False)
	    # Reuse whichever device the trained parameters ended up on.
	    dev = next(model.parameters()).device

	    def _with_arity(k):
	        """Perturbation labels that ``data.parse`` splits into exactly k parts."""
	        return [p for p in data.perturbations if len(data.parse(p)) == k]

	    singles = _with_arity(1)
	    combos = _with_arity(2)
	    # Figure 1 shows two single perturbations and one combination.
	    panel_targets = [singles[0], singles[7], combos[0]]
	    figure1(model, data, panel_targets, dev)
	    figure2_results()
	    print("FIGURES_V2_DONE")