"""Fig 1: dataset overview — degree distributions (log-log CCDF + power-law fit). Reads the raw edge files and plots complementary CDFs for co-author degree, paper read-degree, and paper in-citation degree. Pure numpy/pandas. """ from pathlib import Path import sys import numpy as np import pandas as pd import matplotlib.pyplot as plt sys.path.insert(0, str(Path(__file__).resolve().parent)) from plot_style import apply, save, PALETTE_DEEP as C # noqa: E402 apply() ROOT = Path(__file__).resolve().parents[2] FIG = ROOT / "reports" / "figures" DD = ROOT / "data_and_docs" co = pd.read_csv(DD / "author_file_ann.txt", sep=r"\s+", header=None) cite = pd.read_csv(DD / "paper_file_ann.txt", sep=r"\s+", header=None) read = pd.read_csv(DD / "bipartite_train_ann.txt", sep=r"\s+", header=None) co_deg = np.bincount(np.concatenate([co[0].to_numpy(), co[1].to_numpy()])) # undirected read_deg = np.bincount(read[1].to_numpy()) # paper read-degree cite_in_deg = np.bincount(cite[1].to_numpy()) # paper in-degree def plot_ccdf(deg, ax, color, title): v = np.sort(deg[deg > 0]) uniq, cnt = np.unique(v, return_counts=True) cum = np.cumsum(cnt) n = len(v) ccdf = (n - cum + cnt) / n ax.loglog(uniq, ccdf, ".", color=color, markersize=4) # power-law fit on the log-log CCDF x = np.log10(uniq) y = np.log10(ccdf) slope, intercept = np.polyfit(x, y, 1) xs = np.linspace(x.min(), x.max(), 50) ax.loglog(10 ** xs, 10 ** (slope * xs + intercept), "-", color=color, lw=1.5, label=f"slope={slope:.2f}") ax.set_title(title) ax.set_xlabel("degree k") ax.set_ylabel("P(degree ≥ k)") ax.legend(fontsize=9) fig, axes = plt.subplots(1, 3, figsize=(15, 4.6)) plot_ccdf(co_deg, axes[0], C[0], "(a) Co-author degree") plot_ccdf(read_deg, axes[1], C[2], "(b) Paper read-degree") plot_ccdf(cite_in_deg, axes[2], C[3], "(c) Paper in-citation degree") stats = ("6,611 authors / 79,937 papers\n" "co-author edges: 9,663 | citations: 327,113\n" "train read edges: 682,421 | test pairs: 2,047,262\n" "bipartite density: 1.29e-3 | 56% of authors have degree 1") fig.text(0.5, -0.04, stats, ha="center", fontsize=9, color="dimgray") fig.suptitle("Dataset overview: heavy-tailed degree distributions (power-law CCDF)", y=1.02) save(fig, "fig1_dataset_overview", FIG) print("saved fig1_dataset_overview")