| """Generate the full figures_v2 paper figure set. |
| |
| Usage: |
| python figures_v2/scripts/make_all_figures.py --package-root . |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import math |
| from pathlib import Path |
|
|
| import matplotlib.pyplot as plt |
| import numpy as np |
| import pandas as pd |
| from matplotlib.lines import Line2D |
| from matplotlib.patches import FancyArrowPatch, FancyBboxPatch, Rectangle |
|
|
| from data_loaders import ( |
| high_order_summary, |
| inventory_files, |
| load_dataset_degrees, |
| load_manual_metrics, |
| load_npy, |
| numeric_bucket_key, |
| path_exists, |
| pr_curve, |
| read_table, |
| rw_ensemble_metrics, |
| ) |
| from figure_specs import FIGURE_SPECS |
| from plot_style import ( |
| ANNOTATION_SIZE, |
| BAR_EDGEWIDTH, |
| COLORS, |
| DOUBLE_COL, |
| LABEL_SIZE, |
| LEGEND_SIZE, |
| LINEWIDTH, |
| MARKER_SIZE, |
| SINGLE_COL, |
| SUBTITLE_SIZE, |
| TICK_SIZE, |
| apply_style, |
| output_dirs, |
| panel_label, |
| save_all, |
| ) |
|
|
|
|
def ccdf(values: np.ndarray) -> tuple[np.ndarray, np.ndarray, float]:
    """Return the empirical complementary CDF of the positive entries of *values*.

    Returns:
        ``(x, y, slope)`` where ``x`` holds the distinct positive values in
        ascending order, ``y[i]`` is the fraction of positive entries that are
        ``>= x[i]``, and ``slope`` is the least-squares slope of ``log10(y)``
        against ``log10(x)`` (``0.0`` when fewer than two distinct values
        exist). When there is no positive entry at all the placeholder
        ``([1], [1.0], 0.0)`` is returned.
    """
    arr = np.asarray(values)
    positive = arr[arr > 0]
    total = len(positive)
    if total == 0:
        return np.array([1]), np.array([1.0]), 0.0

    # np.unique returns its values sorted, so no explicit sort is required.
    x, counts = np.unique(positive, return_counts=True)
    # Reverse cumulative sum = number of samples greater than or equal to x[i].
    at_least = np.cumsum(counts[::-1])[::-1]
    y = at_least / total

    if len(x) < 2:
        return x, y, 0.0
    slope = np.polyfit(np.log10(x), np.log10(y), 1)[0]
    return x, y, float(slope)
|
|
|
|
def result(key: str, status: str, files: list[str], sources: list[str], limitation: str = "") -> dict:
    """Assemble the bookkeeping record for one figure.

    Combines the run outcome (``status``, ``files``, ``sources``,
    ``limitation``) with the static fields stored under ``FIGURE_SPECS[key]``.
    Raises ``KeyError`` when ``key`` or one of the copied spec fields is absent.
    """
    spec = FIGURE_SPECS[key]
    record = {"key": key, "status": status, "files": files, "sources": sources}
    # Copy the descriptive fields from the spec in their established order.
    for field in ("purpose", "caption", "paper_position", "main_text"):
        record[field] = spec[field]
    record["known_limitations"] = limitation
    return record
|
|
|
|
def make_fig1(root: Path, dirs: dict[str, Path]) -> dict:
    """Draw the schematic task graph and return its ``result`` record.

    The drawing is a fixed, hand-placed layout: two rounded panels ("Authors"
    and "Papers"), four nodes in each, the three relation types and one
    highlighted test pair. ``root`` is not read here; ``dirs`` is handed to
    ``save_all`` unchanged.
    """
    key = "fig1_task_graph"
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.35), constrained_layout=True)
    ax.set_xlim(0, 12)
    ax.set_ylim(0, 6)
    ax.axis("off")

    # Background panels, one per node type: (x, y, width, height, title, fill, outline).
    panels = (
        (0.25, 1.35, 3.05, 3.85, "Authors", COLORS["pale_blue"], COLORS["blue"]),
        (8.65, 1.35, 3.10, 3.85, "Papers", COLORS["pale_green"], COLORS["green"]),
    )
    for px, py, pw, ph, title, fill, outline in panels:
        ax.add_patch(
            FancyBboxPatch((px, py), pw, ph, boxstyle="round,pad=0.05,rounding_size=0.08", fc=fill, ec=outline, lw=0.9)
        )
        ax.text(px + pw / 2, py + ph - 0.28, title, ha="center", va="center", fontsize=LABEL_SIZE, color=outline)

    authors = {"A1": (1.05, 4.35), "A2": (2.25, 4.15), "A3": (1.15, 2.65), "A4": (2.35, 2.45)}
    papers = {"P1": (9.45, 4.35), "P2": (10.75, 4.15), "P3": (9.55, 2.65), "P4": (10.85, 2.45)}

    # Authors are circles, papers are squares; both carry a centred white label.
    node_text = {
        "ha": "center",
        "va": "center",
        "fontsize": ANNOTATION_SIZE,
        "color": "white",
        "fontweight": "bold",
        "zorder": 5,
    }
    for name, (nx, ny) in authors.items():
        ax.scatter([nx], [ny], s=260, color=COLORS["blue"], edgecolor="white", linewidth=0.8, zorder=4)
        ax.text(nx, ny, name, **node_text)
    for name, (nx, ny) in papers.items():
        ax.add_patch(Rectangle((nx - 0.27, ny - 0.22), 0.54, 0.44, fc=COLORS["green"], ec="white", lw=0.8, zorder=4))
        ax.text(nx, ny, name, **node_text)

    # Edge table: (start, end, colour key, line style, width, arrow head?, curvature).
    nodes = {**authors, **papers}
    links = (
        ("A1", "P1", "blue", "-", 1.15, False, 0.05),
        ("A2", "P2", "blue", "-", 1.15, False, -0.04),
        ("A3", "P3", "blue", "-", 1.15, False, 0.02),
        ("A1", "A2", "orange", "--", 1.1, False, 0.10),
        ("A3", "A4", "orange", "--", 1.1, False, -0.10),
        ("P1", "P2", "red", ":", 1.3, True, -0.10),
        ("P3", "P4", "red", ":", 1.3, True, 0.10),
        ("A4", "P3", "dark", "-", 2.2, False, -0.12),  # the highlighted test pair
    )
    for start, end, color_key, style, width, directed, rad in links:
        ax.add_patch(
            FancyArrowPatch(
                nodes[start],
                nodes[end],
                arrowstyle="-|>" if directed else "-",
                mutation_scale=9,
                connectionstyle=f"arc3,rad={rad}",
                color=COLORS[color_key],
                lw=width,
                linestyle=style,
                alpha=0.95,
                zorder=2,
            )
        )

    ax.text(
        6.0,
        3.22,
        "test pair: rank score -> 0/1",
        ha="center",
        va="center",
        fontsize=ANNOTATION_SIZE,
        color=COLORS["dark"],
        bbox={"boxstyle": "round,pad=0.18", "fc": "white", "ec": "none", "alpha": 0.92},
    )
    ax.text(
        6.0,
        0.63,
        "6,611 authors; 79,937 papers; 2,047,262 test pairs; metric: F1",
        ha="center",
        fontsize=ANNOTATION_SIZE,
        color=COLORS["gray"],
    )

    legend_entries = [
        Line2D([0], [0], color=COLORS["blue"], lw=1.4, label="author-paper"),
        Line2D([0], [0], color=COLORS["orange"], lw=1.4, ls="--", label="coauthor"),
        Line2D([0], [0], color=COLORS["red"], lw=1.4, ls=":", label="citation"),
        Line2D([0], [0], color=COLORS["dark"], lw=2.0, label="test pair"),
    ]
    ax.legend(
        handles=legend_entries,
        loc="upper center",
        bbox_to_anchor=(0.5, 0.08),
        ncol=4,
        handlelength=2.0,
        columnspacing=1.0,
    )
    saved = save_all(fig, key, dirs)
    return result(key, "ok", saved, ["schematic; README.md; data_and_docs/dataset.md"])
|
|
|
|
def make_fig2(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot degree CCDFs and the low-degree mass of the dataset.

    Panels (a)-(c) show log-log CCDFs with a fitted line and the slope
    reported by ``ccdf``. Panel (d) shows, for degrees 1..8, the fraction of
    authors and of papers (among those with positive degree) having exactly
    that degree. Returns a "skipped" record when ``load_dataset_degrees``
    yields ``None``.
    """
    key = "fig2_dataset_sparsity"
    data = load_dataset_degrees(root)
    if data is None:
        return result(key, "skipped", [], ["data_and_docs edge files"], "One or more official edge files are missing.")

    fig, axes = plt.subplots(2, 2, figsize=(DOUBLE_COL, 5.0), constrained_layout=True)

    # (grid cell, degree array key, colour key, panel title)
    ccdf_panels = (
        ((0, 0), "coauthor_degree", "blue", "(a) Coauthor degree"),
        ((0, 1), "citation_indegree", "red", "(b) Citation in-degree"),
        ((1, 0), "paper_read_degree", "green", "(c) Author-paper degree"),
    )
    for cell, column, color_key, title in ccdf_panels:
        ax = axes[cell]
        color = COLORS[color_key]
        x, y, slope = ccdf(data[column])
        ax.loglog(x, y, ".", color=color, markersize=2.6, alpha=0.85)
        if len(x) > 1:
            # Refit in log-log space to obtain the intercept needed for the line.
            log_x = np.log10(x)
            gradient, intercept = np.polyfit(log_x, np.log10(y), 1)
            grid = np.linspace(log_x.min(), log_x.max(), 60)
            ax.loglog(10**grid, 10 ** (gradient * grid + intercept), color=color, lw=1.0, alpha=0.75)
        ax.set_title(title)
        ax.set_xlabel("degree k")
        ax.set_ylabel("P(deg >= k)")
        ax.text(0.64, 0.84, f"slope {slope:.2f}", transform=ax.transAxes, fontsize=ANNOTATION_SIZE, color=color)

    ax = axes[1, 1]
    bins = np.arange(1, 9)
    width = 0.34
    # (bar offset, degree array key, colour key, legend label)
    bar_groups = (
        (-width / 2, "author_read_degree", "blue", "authors"),
        (width / 2, "paper_read_degree", "green", "papers"),
    )
    for offset, column, color_key, label in bar_groups:
        degrees = data[column]
        active = degrees[degrees > 0]
        fractions = [(active == k).mean() for k in bins]
        ax.bar(
            bins + offset,
            fractions,
            width=width,
            color=COLORS[color_key],
            edgecolor="white",
            linewidth=BAR_EDGEWIDTH,
            label=label,
        )
    ax.set_title("(d) Low-degree mass")
    ax.set_xlabel("degree k")
    ax.set_ylabel("fraction")
    ax.set_xticks(bins)
    ax.legend(loc="upper right")
    ax.text(0.05, 0.86, "cold-start mass", transform=ax.transAxes, fontsize=ANNOTATION_SIZE, color=COLORS["gray"])

    saved = save_all(fig, key, dirs)
    edge_files = [
        "data_and_docs/author_file_ann.txt",
        "data_and_docs/paper_file_ann.txt",
        "data_and_docs/bipartite_train_ann.txt",
    ]
    return result(key, "ok", saved, edge_files)
|
|
|
|
def make_fig3(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot validation and public F1 across the stages of the manual metrics table.

    Rows from ``load_manual_metrics`` are sorted by ``order`` and drawn against
    their row position. Gain annotations are placed by looking up the row that
    carries a given ``order`` value, so they stay attached to the right markers
    even when ``order`` values do not coincide with row positions.

    Raises:
        KeyError: if an annotated stage (``order`` 1, 2, 4, 5 or 6) is missing.
    """
    key = "fig3_performance_evolution"
    df = load_manual_metrics(root).sort_values("order")
    x = np.arange(len(df))
    order = df["order"].to_numpy()

    def stage_point(order_value):
        """Return (x position, validation F1) of the first row with this order."""
        hits = np.flatnonzero(order == order_value)
        if hits.size == 0:
            raise KeyError(f"manual metrics contain no row with order={order_value}")
        first = hits[0]
        return float(x[first]), float(df["val_f1"].iloc[first])

    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.6), constrained_layout=True)
    # NOTE(review): the shaded bands and the two labels at x=6.08 are hard-coded
    # for a seven-row table -- confirm against manual_metrics.csv.
    ax.axvspan(-0.4, 1.5, color=COLORS["light_gray"], alpha=0.06)
    ax.axvspan(1.5, 4.5, color=COLORS["orange"], alpha=0.05)
    ax.axvspan(4.5, 6.4, color=COLORS["green"], alpha=0.05)
    ax.plot(x, df["val_f1"], "-o", color=COLORS["blue"], label="validation F1", markersize=MARKER_SIZE)

    # Public scores exist only for some stages; plot just the non-NaN ones.
    public = df["public_f1"].to_numpy(dtype=float)
    mask = ~np.isnan(public)
    ax.plot(x[mask], public[mask], "--s", color=COLORS["red"], markerfacecolor="white", label="public F1", markersize=MARKER_SIZE)

    ax.set_xticks(x)
    ax.set_xticklabels(df["label"], fontsize=TICK_SIZE)
    ax.set_ylabel("F1-score")
    ax.set_ylim(0.875, 0.972)
    ax.set_title("Performance evolution")
    ax.legend(loc="lower right")

    annotations = [(1, 2, "+0.0174 graph stack"), (4, 5, "+0.0028 RW blocks"), (5, 6, "+0.0020 high-order")]
    for left, right, text in annotations:
        x_left, f1_left = stage_point(left)
        x_right, f1_right = stage_point(right)
        # Draw the span slightly below the lower of the two markers.
        y = min(f1_left, f1_right) - 0.006
        ax.annotate("", xy=(x_right, y), xytext=(x_left, y), arrowprops={"arrowstyle": "<->", "lw": 0.8, "color": COLORS["gray"]})
        ax.text((x_left + x_right) / 2, y - 0.0045, text, ha="center", va="top", fontsize=ANNOTATION_SIZE, color=COLORS["dark"])

    ax.text(6.08, 0.9669, "0.9669 val", fontsize=ANNOTATION_SIZE, color=COLORS["blue"], va="bottom")
    ax.text(6.08, 0.9663, "0.9663 public", fontsize=ANNOTATION_SIZE, color=COLORS["red"], va="top")
    files = save_all(fig, key, dirs)
    return result(key, "ok", files, ["figures_v2/data/manual_metrics.csv", "README.md", "reports/*.md"])
|
|
|
|
def make_fig4(root: Path, dirs: dict[str, Path]) -> dict:
    """Draw the three-column method pipeline schematic.

    Columns are the input graphs, the feature families and the stacker /
    decision / submission chain. All coordinates are in the unit square; no
    data is read and ``root`` is unused.
    """
    key = "fig4_method_pipeline"
    fig, ax = plt.subplots(figsize=(DOUBLE_COL, 3.8), constrained_layout=True)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis("off")

    def draw_box(x, y, w, h, text, fc, ec, fs=7.4):
        """Add a rounded, centre-labelled box and return its patch."""
        patch = FancyBboxPatch((x, y), w, h, boxstyle="round,pad=0.018,rounding_size=0.02", fc=fc, ec=ec, lw=0.9)
        ax.add_patch(patch)
        ax.text(x + w / 2, y + h / 2, text, ha="center", va="center", fontsize=fs, color=COLORS["dark"], linespacing=1.15)
        return patch

    def connect(start, end, rad=0.0):
        """Add a thin grey arrow from ``start`` to ``end``."""
        ax.add_patch(
            FancyArrowPatch(
                start,
                end,
                arrowstyle="-|>",
                mutation_scale=9,
                lw=0.8,
                color=COLORS["gray"],
                connectionstyle=f"arc3,rad={rad}",
            )
        )

    for cx, heading in ((0.14, "Input graphs"), (0.50, "Feature families"), (0.84, "Stacker and decision")):
        ax.text(cx, 0.94, heading, ha="center", fontsize=SUBTITLE_SIZE, color=COLORS["dark"])

    # Left column: (y, label, fill key, outline key).
    input_specs = (
        (0.73, "author-paper\ninteractions", "pale_blue", "blue"),
        (0.56, "coauthor\ngraph", "pale_orange", "orange"),
        (0.39, "paper citation\ngraph", "pale_red", "red"),
        (0.22, "paper content\nfeatures", "pale_green", "green"),
    )
    inputs = [
        draw_box(0.03, y, 0.22, 0.10, label, COLORS[fill], COLORS[outline])
        for y, label, fill, outline in input_specs
    ]

    # Middle column: (y, height, label, fill key, outline key, font size).
    feature_specs = (
        (0.75, 0.10, "LightGCN score\nand ranks", "pale_blue", "blue", 7.4),
        (0.60, 0.10, "graph/meta-path\nlocal features", "light_gray", "gray", 7.4),
        (0.45, 0.10, "content, BPR-MF,\nvariant scores", "pale_green", "green", 7.4),
        (0.30, 0.10, "DeepWalk/Node2Vec\n7 RW blocks", "pale_orange", "orange", 7.4),
        (0.15, 0.11, "citation propagation\nH_k=RC^k; G_k=SRC^k", "pale_red", "red", 7.0),
    )
    features = [
        draw_box(0.35, y, 0.28, h, label, COLORS[fill], COLORS[outline], fs=size)
        for y, h, label, fill, outline, size in feature_specs
    ]

    # Right column: stacker, decision rule, submission.
    draw_box(0.73, 0.54, 0.22, 0.13, "LightGBM OOF\nstacker (~259 feats)", "#F5F5F5", COLORS["dark"], fs=7.5)
    draw_box(0.73, 0.33, 0.22, 0.12, "rank cutoff\ntop 50% + known positives", COLORS["pale_green"], COLORS["green"], fs=7.1)
    draw_box(0.78, 0.14, 0.12, 0.08, "submission", COLORS["pale_blue"], COLORS["blue"], fs=7.4)

    # Each input feeds the feature boxes at its own index and its neighbours.
    targets = [0.80, 0.65, 0.50, 0.35, 0.205]
    for i, patch in enumerate(inputs):
        mid_y = patch.get_y() + patch.get_height() / 2
        for target_y in targets[max(i - 1, 0) : i + 2]:
            connect((0.25, mid_y), (0.35, target_y), rad=0.03)

    # Every feature family feeds the stacker.
    for patch in features:
        mid_y = patch.get_y() + patch.get_height() / 2
        connect((0.63, mid_y), (0.73, 0.605), rad=0.0)

    connect((0.84, 0.54), (0.84, 0.45))
    connect((0.84, 0.33), (0.84, 0.22))

    saved = save_all(fig, key, dirs)
    return result(key, "ok", saved, ["README.md", "CLAUDE.md", "code/high_order_graph_stack.py"])
|
|
|
|
def make_fig5(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot validation F1 and AUC for the four high-order ablation stages.

    The stage table comes from ``high_order_summary``; rows are selected by
    stage name, so a missing stage raises ``KeyError``. Panel (a) labels each
    point with its signed change from the previous stage; panel (b) labels
    each point with its AUC. The record's status is "ok" when the validation
    summary CSV exists under ``root`` and "fallback" otherwise.
    """
    key = "fig5_highorder_ablation"
    df = high_order_summary(root).set_index("stage")
    order = ["base_highorder", "rich_rw7", "rich_rw7_highorder", "rich_rw7_highorder_directed"]
    df = df.loc[order]
    labels = ["base +\nundir.", "+ rich content\n+ 7 RW", "+ undir.\nhigh-order", "+ directed\ncitation"]
    x = np.arange(len(order))
    f1 = df["validation_f1"].to_numpy(float)
    auc = df["auc"].to_numpy(float)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(DOUBLE_COL, 2.75), constrained_layout=True)
    panel_label(ax1, "(a)")
    panel_label(ax2, "(b)")

    ax1.plot(x, f1, "-o", color=COLORS["blue"], markersize=MARKER_SIZE)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels)
    ax1.set_ylabel("Validation F1\n(zoomed)")
    ax1.set_title("F1 by stage")
    ax1.set_ylim(f1.min() - 0.00055, f1.max() + 0.00055)
    for i, (previous, current) in enumerate(zip(f1, f1[1:]), start=1):
        # The "+" format flag writes the sign itself, so a regression reads
        # "-0.00010" rather than "+-0.00010".
        ax1.text(i, current + 0.00018, f"{current - previous:+.5f}", ha="center", fontsize=ANNOTATION_SIZE, color=COLORS["dark"])

    ax2.plot(x, auc, "-s", color=COLORS["purple"], markersize=MARKER_SIZE)
    ax2.set_xticks(x)
    ax2.set_xticklabels(labels)
    ax2.set_ylabel("AUC\n(zoomed)")
    ax2.set_title("AUC by stage")
    ax2.set_ylim(auc.min() - 0.00035, auc.max() + 0.00022)
    for xi, val in zip(x, auc):
        ax2.text(xi, val + 0.00006, f"{val:.5f}", ha="center", fontsize=ANNOTATION_SIZE, color=COLORS["purple"])

    files = save_all(fig, key, dirs)
    source = "validation_runs/dynamic_seed202/high_order_graph_stack/validation_summary.csv"
    # NOTE(review): presumably high_order_summary substitutes reported values
    # when this CSV is absent -- confirm in data_loaders.
    status = "ok" if path_exists(root / source) else "fallback"
    return result(key, status, files, [source])
|
|
|
|
def make_fig6(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot the predicted-positive ratio sweep and the validation/test ratio drift.

    Panel (a) draws validation F1 against the predicted-positive ratio from
    ``stack_ratio_analysis.csv`` (or a built-in parabola when that table is
    unavailable) and marks the rank cutoff and the probability-threshold
    ratio. Panel (b) compares the positive ratio on validation and test for
    both rules, using the ``th0.461731`` row of the threshold summary when it
    is present. The status is "ok" if at least one of the two CSVs exists,
    else "fallback".
    """
    key = "fig6_calibration_rank_cutoff"
    runs = root / "validation_runs"
    ratio_path = runs / "stack_ratio_analysis.csv"
    threshold_path = runs / "dynamic_seed202" / "high_order_graph_stack" / "threshold_submission_summary.csv"
    ratio = read_table(ratio_path)
    thr = read_table(threshold_path)

    rank_ratio = 0.500
    reported_prob_ratio = 0.524195

    fig, (ax1, ax2) = plt.subplots(
        1,
        2,
        figsize=(DOUBLE_COL, 2.85),
        constrained_layout=True,
        gridspec_kw={"width_ratios": [1.35, 1.0]},
    )
    panel_label(ax1, "(a)")
    panel_label(ax2, "(b)")

    if ratio is None:
        # No sweep table: draw a stand-in curve with the reported F1 values.
        xs = np.linspace(0.48, 0.53, 30)
        ys = 0.9556 - 5.0 * (xs - 0.50) ** 2
        ax1.plot(xs, ys, "-o", color=COLORS["blue"], markersize=3.0, label="validation F1")
        y_rank = 0.9556
        y_prob = 0.9527
    else:
        ratio = ratio.sort_values("ratio")
        ax1.plot(ratio["ratio"], ratio["f1_mean"], "-o", color=COLORS["blue"], markersize=3.4, label="validation F1")
        ax1.fill_between(ratio["ratio"], ratio["f1_min"], ratio["f1_max"], color=COLORS["blue"], alpha=0.12, linewidth=0)
        # Read the curve at the two ratios of interest by linear interpolation.
        y_rank = float(np.interp(rank_ratio, ratio["ratio"], ratio["f1_mean"]))
        y_prob = float(np.interp(reported_prob_ratio, ratio["ratio"], ratio["f1_mean"]))

    ax1.axvline(rank_ratio, color=COLORS["green"], lw=1.3, label="rank cutoff 0.500")
    ax1.axvline(reported_prob_ratio, color=COLORS["red"], lw=1.3, ls="--", label="prob. threshold ratio 0.524")
    ax1.scatter([rank_ratio], [y_rank], color=COLORS["green"], s=28, zorder=5)
    ax1.scatter([reported_prob_ratio], [y_prob], color=COLORS["red"], s=28, zorder=5)
    ax1.set_xlabel("Predicted-positive ratio")
    ax1.set_ylabel("Validation F1")
    ax1.set_title("Ratio sweep")
    ax1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.42), ncol=2, fontsize=6.7)

    # Prefer the measured test ratio of the probability threshold when recorded.
    prob_ratio = reported_prob_ratio
    if thr is not None:
        is_rule = thr["rule"].eq("th0.461731")
        if is_rule.any():
            prob_ratio = float(thr.loc[is_rule, "positive_ratio"].iloc[0])

    names = ["rank\ncutoff", "probability\nthreshold"]
    val = [0.500, 0.500]
    test = [0.500, prob_ratio]
    positions = np.arange(2)
    bar_w = 0.34
    half = bar_w / 2
    ax2.bar(positions - half, val, bar_w, color=COLORS["green"], label="validation", edgecolor="white", linewidth=BAR_EDGEWIDTH)
    ax2.bar(positions + half, test, bar_w, color=COLORS["red"], label="test", edgecolor="white", linewidth=BAR_EDGEWIDTH)
    for pos, v, t in zip(positions, val, test):
        # Highlight the test value in red when it drifts by more than 0.01.
        drifted = abs(t - v) > 0.01
        ax2.text(pos - half, v + 0.003, f"{v:.3f}", ha="center", fontsize=ANNOTATION_SIZE)
        ax2.text(
            pos + half,
            t + 0.003,
            f"{t:.3f}",
            ha="center",
            fontsize=ANNOTATION_SIZE,
            color=COLORS["red"] if drifted else COLORS["dark"],
        )
    ax2.set_xticks(positions)
    ax2.set_xticklabels(names)
    ax2.set_ylim(0.46, 0.54)
    ax2.set_ylabel("Positive ratio")
    ax2.set_title("Test drift")
    ax2.legend(loc="upper left", fontsize=6.8)

    saved = save_all(fig, key, dirs)
    sources = [str(p.relative_to(root)) for p in (ratio_path, threshold_path) if path_exists(p)]
    if sources:
        return result(key, "ok", saved, sources)
    return result(key, "fallback", saved, ["reported fallback values"])
|
|
|
|
def make_figA1(root: Path, dirs: dict[str, Path]) -> dict:
    """Draw the LightGCN (embedding dim x layers) validation-F1 heat map.

    Rows of ``dynamic_summary.csv`` whose ``split`` is ``dynamic_seed202`` are
    kept; layer count and embedding dimension are parsed from the ``run`` name
    (pattern ``l<layers>d<dim>``) and the best F1 per (dim, layers) cell is
    shown. Cells without a run are masked. The (dim=512, layers=2) cell is
    outlined when present. Returns a "skipped" record if the CSV is missing.
    """
    key = "figA1_lightgcn_sweep"
    csv_path = root / "validation_runs" / "dynamic_summary.csv"
    df = read_table(csv_path)
    if df is None:
        return result(key, "skipped", [], [str(csv_path.relative_to(root))], "dynamic_summary.csv is missing.")

    work = df[df["split"].eq("dynamic_seed202")].copy()
    parsed = work["run"].str.extract(r"l(?P<layers>\d+)d(?P<dim>\d+)").astype(float)
    # Runs whose name does not match the pattern yield NaN and are dropped.
    work = pd.concat([work, parsed], axis=1).dropna(subset=["layers", "dim"])
    work["layers"] = work["layers"].astype(int)
    work["dim"] = work["dim"].astype(int)
    piv = work.groupby(["dim", "layers"])["f1"].max().unstack()

    fig, ax = plt.subplots(figsize=(SINGLE_COL * 1.35, 2.85), constrained_layout=True)
    data = piv.to_numpy(float)
    cmap = plt.cm.viridis.copy()
    cmap.set_bad(COLORS["light_gray"])
    img = ax.imshow(np.ma.masked_invalid(data), cmap=cmap, aspect="auto")
    ax.set_xticks(np.arange(len(piv.columns)))
    ax.set_xticklabels([str(c) for c in piv.columns])
    ax.set_yticks(np.arange(len(piv.index)))
    ax.set_yticklabels([str(i) for i in piv.index])

    # The median is the same for every cell, so compute it once. Skip it when
    # there is no finite-or-infinite value to avoid NumPy's all-NaN warning.
    median = math.nan if np.isnan(data).all() else float(np.nanmedian(data))
    for (i, j), val in np.ndenumerate(data):
        missing = math.isnan(val)
        # White text on the brighter half of the colour map, dark elsewhere.
        bright = not missing and val > median
        ax.text(
            j,
            i,
            "" if missing else f"{val:.5f}",
            ha="center",
            va="center",
            fontsize=6.5,
            color="white" if bright else COLORS["dark"],
        )

    if 512 in piv.index and 2 in piv.columns:
        col = list(piv.columns).index(2)
        row = list(piv.index).index(512)
        ax.add_patch(Rectangle((col - 0.5, row - 0.5), 1, 1, fill=False, ec=COLORS["red"], lw=1.0))

    ax.set_xlabel("Layers")
    ax.set_ylabel("Embedding dim")
    ax.set_title("LightGCN validation sweep")
    cbar = fig.colorbar(img, ax=ax, shrink=0.78)
    cbar.set_label("Validation F1")
    files = save_all(fig, key, dirs)
    return result(key, "ok", files, [str(csv_path.relative_to(root))])
|
|
|
|
def make_figA2(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot validation F1 against the number of random-walk blocks.

    Sizes, scores and source paths all come from ``rw_ensemble_metrics``; each
    point is labelled with its F1 to five decimals.
    """
    key = "figA2_rw_ensemble"
    sizes, f1s, sources = rw_ensemble_metrics(root)

    fig, ax = plt.subplots(figsize=(SINGLE_COL, 2.3), constrained_layout=True)
    ax.plot(sizes, f1s, "-o", color=COLORS["green"], markersize=MARKER_SIZE)
    ax.set_xticks(sizes)
    ax.set_xlabel("# random-walk blocks")
    ax.set_ylabel("Validation F1")
    ax.set_title("RW ensemble size")

    # Zoom tightly around the observed range.
    lowest, highest = min(f1s), max(f1s)
    ax.set_ylim(lowest - 0.00035, highest + 0.00035)

    for size, score in zip(sizes, f1s):
        ax.text(size, score + 0.00012, f"{score:.5f}", ha="center", fontsize=ANNOTATION_SIZE)

    saved = save_all(fig, key, dirs)
    return result(key, "ok", saved, sources)
|
|
|
|
def make_figA3(root: Path, dirs: dict[str, Path]) -> dict:
    """Draw a horizontal bar chart of incremental F1 gain per feature group.

    The gains are constants written in this function; the returned record
    cites ``manual_metrics.csv`` and the reports as their provenance. ``root``
    is unused.
    """
    key = "figA3_feature_group_contribution"
    # (label, incremental validation F1 gain, bar colour), largest first.
    groups = [
        ("Graph/meta-path", 0.0174, COLORS["blue"]),
        ("7 RW blocks", 0.0028, COLORS["orange"]),
        ("DeepWalk/Node2Vec", 0.0022, COLORS["green"]),
        ("High-order", 0.0020, COLORS["red"]),
        ("BPR-MF", 0.0017, COLORS["purple"]),
        ("Variant scores", 0.0011, COLORS["gray"]),
        ("Rich content", 0.0006, COLORS["green"]),
        ("Mean-cos content", 0.0005, COLORS["blue"]),
    ]
    labels = tuple(entry[0] for entry in groups)
    gains = tuple(entry[1] for entry in groups)
    colors = tuple(entry[2] for entry in groups)
    rows = np.arange(len(groups))

    fig, ax = plt.subplots(figsize=(SINGLE_COL * 1.45, 3.0), constrained_layout=True)
    ax.barh(rows, gains, color=colors, edgecolor="white", linewidth=BAR_EDGEWIDTH)
    ax.set_yticks(rows)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()
    ax.set_xlim(0, 0.0185)
    ax.set_xlabel("Incremental validation F1 gain")
    ax.set_title("Feature-group contribution")

    for row, gain in zip(rows, gains):
        if gain > 0.010:
            # Long bar: put the value inside it, in white.
            x_text, extra = gain - 0.00045, {"ha": "right", "color": "white"}
        else:
            # Short bar: put the value just past its end.
            x_text, extra = gain + 0.00025, {"ha": "left"}
        ax.text(x_text, row, f"+{gain:.4f}", va="center", fontsize=ANNOTATION_SIZE, **extra)

    saved = save_all(fig, key, dirs)
    return result(key, "ok", saved, ["figures_v2/data/manual_metrics.csv", "reports/*.md"])
|
|
|
|
def small_heatmap(ax, sub: pd.DataFrame, title: str) -> "matplotlib.image.AxesImage":
    """Draw a precision/recall/F1 heat map of the eight lowest-F1 buckets.

    Args:
        ax: Matplotlib axes to draw on.
        sub: Frame with ``bucket``, ``precision``, ``recall`` and ``f1`` columns.
        title: Panel title.

    Returns:
        The image created by ``ax.imshow``, so the caller can attach a colorbar.
    """
    # sort_values already returns a new frame, so the input is left untouched.
    worst = sub.sort_values("f1").head(8)
    mat = worst[["precision", "recall", "f1"]].to_numpy(float)
    labels = [str(v).replace("_", " ").replace("[", " [").replace("degree", "deg") for v in worst["bucket"]]

    img = ax.imshow(mat, cmap="RdYlGn", vmin=0.0, vmax=1.0, aspect="auto")
    ax.set_title(title)
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(["prec.", "rec.", "F1"])
    ax.set_yticks(np.arange(len(labels)))
    ax.set_yticklabels(labels, fontsize=6.3)

    for i, row in enumerate(mat):
        for j, cell in enumerate(row):
            ax.text(j, i, f"{cell:.2f}", ha="center", va="center", fontsize=6.0, color=COLORS["dark"])
        # Outline whole rows whose F1 (third column) falls below 0.90.
        if row[2] < 0.90:
            ax.add_patch(Rectangle((-0.5, i - 0.5), 3, 1, fill=False, ec=COLORS["red"], lw=0.8))
    return img
|
|
|
|
def make_figA4(root: Path, dirs: dict[str, Path]) -> dict:
    """Draw three bucketed error heat maps (degree, rank/score, local evidence).

    Reads ``error_analysis_buckets.csv``, splits its rows by ``bucket_type``
    into three families and renders each with ``small_heatmap``; one colorbar
    is shared by all panels. Returns a "skipped" record if the CSV is missing.
    """
    key = "figA4_error_buckets"
    csv_path = root / "validation_runs" / "dynamic_seed202" / "error_group_calibration" / "error_analysis_buckets.csv"
    df = read_table(csv_path)
    if df is None:
        return result(key, "skipped", [], [str(csv_path.relative_to(root))], "error_analysis_buckets.csv is missing.")

    df["lo"] = df["bucket"].map(numeric_bucket_key)

    def family(bucket_types):
        """Select the rows whose bucket_type is one of ``bucket_types``."""
        return df[df["bucket_type"].isin(bucket_types)]

    # Only the degree family is pre-ordered by bucket type and numeric key.
    degree = family(["author_degree", "paper_degree", "paper_citation_in"]).sort_values(["bucket_type", "lo"])
    score = family(["final_score", "LightGCN_score", "author_internal_rank"])
    local = family(["has_local_evidence", "BPR-MF_score", "content_score", "DeepWalk_score", "Node2Vec_score"])

    fig, axes = plt.subplots(1, 3, figsize=(DOUBLE_COL, 3.9), constrained_layout=True)
    panels = (
        (degree, "(a) Degree buckets"),
        (score, "(b) Rank / score buckets"),
        (local, "(c) Local evidence buckets"),
    )
    images = [small_heatmap(ax, frame, title) for ax, (frame, title) in zip(axes, panels)]

    # All panels share vmin/vmax, so the last image can drive the colorbar.
    cbar = fig.colorbar(images[-1], ax=axes.ravel().tolist(), shrink=0.72, pad=0.02)
    cbar.set_label("Score")
    saved = save_all(fig, key, dirs)
    return result(key, "ok", saved, [str(csv_path.relative_to(root))])
|
|
|
|
def ecdf(vals: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return the sorted values and their empirical CDF heights.

    The i-th smallest value (1-based) is paired with ``i / n``, where ``n`` is
    the number of values.
    """
    ordered = np.sort(vals)
    count = len(ordered)
    heights = np.arange(1, count + 1) / count
    return ordered, heights
|
|
|
|
def make_figA5(root: Path, dirs: dict[str, Path]) -> dict:
    """Plot OOF precision-recall curves and the score ECDF of the last loaded model.

    Score arrays are plotted only when they load and have the same length as
    the validation labels. Panel (b) shows the ECDF of the last array that
    loaded. That is the "High-order final" model when all arrays are present;
    otherwise the panel is titled with the model actually shown and the
    limitation text says so.

    Returns a "skipped" record when labels or all score arrays are missing,
    "partial" when some arrays are missing, and "ok" otherwise.
    """
    key = "figA5_oof_pr_score"
    vr = root / "validation_runs" / "dynamic_seed202"
    labels_path = vr / "val_labels_seed202.npy"
    y = load_npy(labels_path)
    if y is None:
        return result(key, "skipped", [], ["validation_runs/dynamic_seed202/val_labels_seed202.npy"], "Validation labels are missing.")

    models = [
        ("LightGCN", vr / "dyn202_l2d512_bpr_bigbatch_more" / "scores" / "val_vanilla_ensemble_mean.npy", COLORS["gray"]),
        ("Graph stack", vr / "post95_ablation" / "ensemble_lgcn_oof.npy", COLORS["orange"]),
        ("DeepWalk/Node2Vec", vr / "node2vec_deepwalk" / "node2vec_stack_oof.npy", COLORS["green"]),
        ("High-order final", vr / "high_order_graph_stack" / "rich_rw7_highorder_directed_oof.npy", COLORS["red"]),
    ]
    loaded = []
    for name, path, color in models:
        scores = load_npy(path)
        # Keep only arrays aligned one-to-one with the labels.
        if scores is not None and len(scores) == len(y):
            loaded.append((name, scores.astype(float), color, path))
    if not loaded:
        return result(key, "skipped", [], ["OOF score arrays"], "No aligned OOF score arrays were found.")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(DOUBLE_COL, 2.9), constrained_layout=True, gridspec_kw={"width_ratios": [1.25, 1.0]})
    panel_label(ax1, "(a)")
    panel_label(ax2, "(b)")
    y = y.astype(int)

    for name, scores, color, _ in loaded:
        recall, precision, ap = pr_curve(y, scores)
        ax1.plot(recall, precision, color=color, lw=LINEWIDTH, label=f"{name} AP={ap:.4f}")
    ax1.set_xlabel("Recall")
    ax1.set_ylabel("Precision")
    ax1.set_ylim(0.88, 1.005)
    ax1.set_title("OOF PR curves")
    ax1.legend(loc="lower left", fontsize=6.5)

    shown_name, shown_scores, _, _ = loaded[-1]
    # Only call the panel "Final" when the final model's scores were loaded.
    shows_final = shown_name == models[-1][0]
    pos_x, pos_y = ecdf(shown_scores[y == 1])
    neg_x, neg_y = ecdf(shown_scores[y == 0])
    # Clip the x range to the central 98% so outliers do not flatten the curves.
    lo, hi = np.percentile(shown_scores, [1, 99])
    ax2.plot(neg_x, neg_y, color=COLORS["blue"], label="negative")
    ax2.plot(pos_x, pos_y, color=COLORS["red"], label="positive")
    ax2.set_xlim(lo, hi)
    ax2.set_xlabel("Final OOF score (1st-99th pct.)" if shows_final else f"{shown_name} OOF score (1st-99th pct.)")
    ax2.set_ylabel("ECDF")
    ax2.set_title("Final score ECDF" if shows_final else f"{shown_name} score ECDF")
    ax2.legend(loc="lower right", fontsize=LEGEND_SIZE)

    files = save_all(fig, key, dirs)
    sources = [str(labels_path.relative_to(root))] + [str(p.relative_to(root)) for _, _, _, p in loaded]
    status = "ok" if len(loaded) == len(models) else "partial"
    limitation = "Only aligned OOF arrays are plotted."
    if not shows_final:
        limitation += f" Panel (b) shows {shown_name} because the final model's scores were unavailable."
    return result(key, status, files, sources, limitation)
|
|
|
|
def make_figA6(root: Path, dirs: dict[str, Path]) -> dict:
    """Report the feature-importance figure as skipped.

    Nothing is drawn and neither argument is used; the record lists the cached
    model files as sources and explains why no plot was produced.
    """
    key = "figA6_feature_importance"
    model_files = ["cached_scores/lgb_model.pkl", "cached_scores/lgb_v2_model.pkl"]
    reason = "LightGBM is not importable in this environment and reliable feature names are not available; no importance plot was generated."
    return result(key, "skipped", [], model_files, reason)
|
|
|
|
# Single table pairing every builder with the figure key it produces, in run order.
_BUILDER_TABLE = (
    (make_fig1, "fig1_task_graph"),
    (make_fig2, "fig2_dataset_sparsity"),
    (make_fig3, "fig3_performance_evolution"),
    (make_fig4, "fig4_method_pipeline"),
    (make_fig5, "fig5_highorder_ablation"),
    (make_fig6, "fig6_calibration_rank_cutoff"),
    (make_figA1, "figA1_lightgcn_sweep"),
    (make_figA2, "figA2_rw_ensemble"),
    (make_figA3, "figA3_feature_group_contribution"),
    (make_figA4, "figA4_error_buckets"),
    (make_figA5, "figA5_oof_pr_score"),
    (make_figA6, "figA6_feature_importance"),
)

# Builders in the order they are executed.
FIGURE_BUILDERS = [builder for builder, _ in _BUILDER_TABLE]

# Builder -> figure key, used to label a figure whose builder raised.
BUILDER_KEYS = dict(_BUILDER_TABLE)
|
|
|
|
def write_readme(root: Path, results: list[dict]) -> None:
    """Write ``figures_v2/README_FIGURES.md`` describing every figure record.

    Args:
        root: Repository root; output paths are reported relative to it.
        results: Figure records providing ``key``, ``files``, ``sources``,
            ``purpose``, ``caption`` and ``known_limitations``.

    Raises:
        ValueError: if a listed file does not live under ``root``.
    """
    lines = [
        "# figures_v2 outputs",
        "",
        "Generated by `figures_v2/scripts/make_all_figures.py`.",
        "",
        "Run:",
        "",
        "```bash",
        "python figures_v2/scripts/make_all_figures.py --package-root .",
        "```",
        "",
        "Use PDF files in ACM LaTeX for vector output; PNG files are 300 dpi previews.",
        "",
    ]
    for r in results:
        if r["files"]:
            # as_posix() gives forward slashes on every platform.
            outputs = ", ".join(f"`{Path(f).relative_to(root).as_posix()}`" for f in r["files"])
        else:
            outputs = "skipped"
        lines.extend(
            [
                f"## {r['key']}",
                "",
                f"Figure ID: `{r['key']}`",
                f"Output files: {outputs}",
                f"Data source: {'; '.join(r['sources'])}",
                f"Purpose in paper: {r['purpose']}",
                f"Caption draft: {r['caption']}",
                f"Known limitations: {r['known_limitations'] or 'None.'}",
                "",
            ]
        )
    lines.extend(
        [
            "## LaTeX insertion",
            "",
            "Single-column figure:",
            "",
            "```latex",
            "\\begin{figure}",
            "  \\centering",
            "  \\includegraphics[width=\\columnwidth]{figures_v2/pdf/fig2_dataset_sparsity.pdf}",
            "  \\caption{...}",
            "\\end{figure}",
            "```",
            "",
            "Double-column figure:",
            "",
            "```latex",
            "\\begin{figure*}",
            "  \\centering",
            "  \\includegraphics[width=\\textwidth]{figures_v2/pdf/fig4_method_pipeline.pdf}",
            "  \\caption{...}",
            "\\end{figure*}",
            "```",
            "",
        ]
    )
    (root / "figures_v2" / "README_FIGURES.md").write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
def validate_outputs(root: Path, results: list[dict]) -> pd.DataFrame:
    """Check every reported output file and write the summary CSV.

    One row is produced per file listed in ``results`` with its path relative
    to ``root``, whether it exists, its size, and whether it is larger than
    1000 bytes (``nonempty``). The table is written to
    ``figures_v2/data/output_validation.csv`` and also returned. The column
    set is fixed, so an empty ``results`` still yields a CSV with a header.

    Raises:
        ValueError: if a listed file does not live under ``root``.
    """
    columns = ["figure", "path", "exists", "size_bytes", "nonempty"]
    rows = []
    for r in results:
        for file in r["files"]:
            p = Path(file)
            # Stat once so existence and size come from the same observation.
            try:
                size = p.stat().st_size
                exists = True
            except OSError:
                size = 0
                exists = False
            rows.append(
                {
                    "figure": r["key"],
                    "path": p.relative_to(root).as_posix(),
                    "exists": exists,
                    "size_bytes": size,
                    "nonempty": exists and size > 1000,
                }
            )
    df = pd.DataFrame(rows, columns=columns)
    out = root / "figures_v2" / "data" / "output_validation.csv"
    out.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out, index=False)
    return df
|
|
|
|
def main() -> None:
    """Build every figure, then write the README and the output validation table.

    A builder that raises does not stop the run: the failure is recorded as an
    "error" result whose limitation text carries the exception type and
    message. All Matplotlib figures are closed after each builder so that a
    figure opened by a failing builder is released as well.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", default=".", help="Repository root.")
    args = parser.parse_args()
    root = Path(args.package_root).resolve()
    apply_style()
    dirs = output_dirs(root)
    inventory_path = root / "figures_v2" / "data" / "audit_file_inventory.csv"
    inventory_files(root, inventory_path)

    results = []
    for builder in FIGURE_BUILDERS:
        try:
            r = builder(root, dirs)
        except Exception as exc:
            # Top-level boundary: keep going and report the failure in the README.
            key = BUILDER_KEYS.get(builder, builder.__name__.replace("make_", ""))
            r = result(key, "error", [], [], f"{type(exc).__name__}: {exc}")
        finally:
            # Closing is a no-op for figures that were already closed.
            plt.close("all")
        results.append(r)
        print(f"[{r['status']:<7}] {r['key']}")

    write_readme(root, results)
    validation = validate_outputs(root, results)
    print("\nGenerated files:")
    if validation.empty:
        print("  none")
    else:
        for _, row in validation.iterrows():
            print(f"  {row['path']} ({row['size_bytes']} bytes)")
    print("\nREADME:", (root / "figures_v2" / "README_FIGURES.md").as_posix())
    print("Audit inventory:", inventory_path.as_posix())


if __name__ == "__main__":
    main()
|
|