| """Fig 1: dataset overview — degree distributions (log-log CCDF + power-law fit). |
| |
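Reads the raw edge files and plots complementary CDFs for co-author degree,
paper read-degree, and paper in-citation degree, each with a least-squares
straight-line fit in log-log space. Degrees are computed with plain
numpy/pandas; matplotlib is used only for drawing.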
| """ |
| from pathlib import Path |
| import sys |
|
|
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parent)) |
| from plot_style import apply, save, PALETTE_DEEP as C |
|
|
# Activate the shared plotting style before any figure is created.
apply()

# parents[2] is two directories above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[2]
FIG = ROOT / "reports" / "figures"
DD = ROOT / "data_and_docs"


def _read_edges(filename):
    """Load a header-less, whitespace-delimited table from the data folder."""
    return pd.read_csv(DD / filename, sep=r"\s+", header=None)


co = _read_edges("author_file_ann.txt")
cite = _read_edges("paper_file_ann.txt")
read = _read_edges("bipartite_train_ann.txt")

# Degree vectors: position i holds how often id i occurs in the counted
# column(s). Co-author edges are counted at both endpoints (columns 0 and 1).
co_endpoints = np.concatenate([co[0].to_numpy(), co[1].to_numpy()])
co_deg = np.bincount(co_endpoints)

# Only column 1 is counted for the other two files.
# NOTE(review): presumably column 1 is the paper id (read paper / cited
# paper) -- confirm against the data documentation.
read_deg = np.bincount(read[1].to_numpy())
cite_in_deg = np.bincount(cite[1].to_numpy())
|
|
|
|
def plot_ccdf(deg, ax, color, title):
    """Plot the empirical degree CCDF on log-log axes with a straight-line fit.

    Entries of ``deg`` that are zero (or negative) are ignored. For every
    distinct remaining degree ``k`` the fraction of entries ``>= k`` is drawn
    as a point. A degree-1 least-squares polynomial is then fitted to
    ``log10(CCDF)`` versus ``log10(k)`` and drawn as a line whose legend label
    reports the fitted slope.

    Parameters
    ----------
    deg : array-like of numbers
        One degree value per node (e.g. the output of ``np.bincount``).
    ax : axes-like object
        Must provide ``loglog``, ``set_title``, ``set_xlabel``,
        ``set_ylabel`` and ``legend``.
    color
        Forwarded as ``color=`` to both ``ax.loglog`` calls.
    title : str
        Panel title.

    Notes
    -----
    When fewer than two distinct positive degrees are present, a line cannot
    be fitted (``np.polyfit`` raises on empty input and is rank-deficient for
    a single point). In that case only the points, title and axis labels are
    drawn; the fit line and the legend are skipped.
    """
    deg = np.asarray(deg)
    positive = deg[deg > 0]
    # np.unique returns the distinct values already sorted ascending.
    uniq, cnt = np.unique(positive, return_counts=True)
    n = positive.size
    # Reverse cumulative sum: number of entries with degree >= each uniq[i].
    at_least = np.cumsum(cnt[::-1])[::-1]
    ccdf = at_least / n
    ax.loglog(uniq, ccdf, ".", color=color, markersize=4)

    if uniq.size >= 2:
        # Ordinary least squares in log10-log10 space.
        x = np.log10(uniq)
        y = np.log10(ccdf)
        slope, intercept = np.polyfit(x, y, 1)
        xs = np.linspace(x.min(), x.max(), 50)
        ax.loglog(10 ** xs, 10 ** (slope * xs + intercept), "-", color=color,
                  lw=1.5, label=f"slope={slope:.2f}")

    ax.set_title(title)
    ax.set_xlabel("degree k")
    ax.set_ylabel("P(degree ≥ k)")
    if uniq.size >= 2:
        # Only call legend() when the labelled fit line exists.
        ax.legend(fontsize=9)
|
|
|
|
# One row of three panels, one per degree distribution.
fig, axes = plt.subplots(1, 3, figsize=(15, 4.6))
panels = (
    (co_deg, C[0], "(a) Co-author degree"),
    (read_deg, C[2], "(b) Paper read-degree"),
    (cite_in_deg, C[3], "(c) Paper in-citation degree"),
)
for panel_ax, (degrees, colour, panel_title) in zip(axes, panels):
    plot_ccdf(degrees, panel_ax, colour, panel_title)

# Caption with summary statistics. These figures are hard-coded here, not
# computed from the loaded data, so they must be updated by hand if the
# dataset changes.
stats = "\n".join([
    "6,611 authors / 79,937 papers",
    "co-author edges: 9,663 | citations: 327,113",
    "train read edges: 682,421 | test pairs: 2,047,262",
    "bipartite density: 1.29e-3 | 56% of authors have degree 1",
])
# The caption (y=-0.04) and the suptitle (y=1.02) sit just outside the
# figure's unit box.
# NOTE(review): presumably save() uses a tight bounding box so they are not
# cropped -- confirm in plot_style.
fig.text(0.5, -0.04, stats, ha="center", fontsize=9, color="dimgray")
fig.suptitle(
    "Dataset overview: heavy-tailed degree distributions (power-law CCDF)",
    y=1.02,
)
save(fig, "fig1_dataset_overview", FIG)
print("saved fig1_dataset_overview")
|
|