cs3319-project2 / figures_paper /scripts /fig2_sparsity.py
NLP-beginner's picture
CS3319 Project 2 final deliverable (public F1 = 0.96626)
f28d994
Raw
History Blame Contribute Delete
4.07 kB
"""Figure 2 — Dataset sparsity and long-tail structure (2x2).
Reads the four official edge files and plots log-log CCDFs for co-author degree,
paper citation in-degree, and paper author-interaction (read) degree, plus a
linear-scale long-tail panel. Pure numpy/pandas.
"""
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from style import apply, save, PALETTE as C, COL2, MUTED # noqa: E402
KEY = "fig2_sparsity"
TITLE = "Figure 2. Dataset sparsity and long-tail degree structure"
def _ccdf(deg):
v = np.sort(deg[deg > 0])
uniq, cnt = np.unique(v, return_counts=True)
cum = np.cumsum(cnt)
n = len(v)
ccdf = (n - cum + cnt) / n
slope, _ = np.polyfit(np.log10(uniq), np.log10(ccdf), 1)
return uniq, ccdf, slope
def _panel_ccdf(ax, deg, color, title):
uniq, ccdf, slope = _ccdf(deg)
ax.loglog(uniq, ccdf, ".", color=color, markersize=3.5)
x = np.log10(uniq)
a, b = np.polyfit(x, np.log10(ccdf), 1)
xs = np.linspace(x.min(), x.max(), 50)
ax.loglog(10 ** xs, 10 ** (a * xs + b), "-", color=color, lw=1.4, alpha=0.7)
ax.text(0.62, 0.86, f"slope ≈ {slope:.2f}", transform=ax.transAxes, fontsize=8, color=color)
ax.set_title(title); ax.set_xlabel("degree $k$"); ax.set_ylabel("$P(\\mathrm{deg} \\geq k)$")
def make(root, out):
apply()
DD = root / "data_and_docs"
try:
co = pd.read_csv(DD / "author_file_ann.txt", sep=r"\s+", header=None)
cite = pd.read_csv(DD / "paper_file_ann.txt", sep=r"\s+", header=None)
read = pd.read_csv(DD / "bipartite_train_ann.txt", sep=r"\s+", header=None)
except FileNotFoundError as e:
return dict(key=KEY, title=TITLE, status="skipped", files=[], sources=[], caption=str(e),
note=f"missing data file: {e}")
co_deg = np.bincount(np.concatenate([co[0].to_numpy(), co[1].to_numpy()]))
cite_in = np.bincount(cite[1].to_numpy())
read_deg = np.bincount(read[1].to_numpy())
fig, axes = plt.subplots(2, 2, figsize=(COL2, 4.6))
_panel_ccdf(axes[0, 0], co_deg, C[0], "(a) Co-author degree")
_panel_ccdf(axes[0, 1], cite_in, C[3], "(b) Paper citation in-degree")
_panel_ccdf(axes[1, 0], read_deg, C[2], "(c) Paper author-interaction degree")
# (d) long-tail linear view: fraction of nodes at small degrees
ax = axes[1, 1]
for deg, color, lab in [(co_deg, C[0], "author co-author"),
(read_deg, C[2], "paper read")]:
v = deg[deg > 0]
ks = np.arange(1, 11)
frac = np.array([(v == k).mean() for k in ks])
ax.bar(ks + (0.2 if "read" in lab else -0.2), frac, width=0.4, color=color,
alpha=0.85, label=lab)
ax.set_title("(d) Low-degree mass (cold-start)")
ax.set_xlabel("degree $k$"); ax.set_ylabel("fraction of nodes")
ax.set_xticks(range(1, 11)); ax.legend(fontsize=7)
ax.text(0.40, 0.82,
f"56% of authors have degree 1\nbipartite density ≈ 1.3e-3",
transform=ax.transAxes, fontsize=7.5, color="dimgray")
fig.suptitle("Dataset overview: heavy-tailed, sparse academic graph", fontsize=10, y=1.0)
save(fig, KEY, out)
return dict(key=KEY, title=TITLE, status="ok",
files=[f"{KEY}.pdf", f"{KEY}.png", f"{KEY}.svg"],
sources=[str(DD / "author_file_ann.txt"), str(DD / "paper_file_ann.txt"),
str(DD / "bipartite_train_ann.txt")],
caption=(
"Dataset sparsity and long-tail degree structure. (a–c) Complementary CDFs (log–log) "
"of co-author degree, paper citation in-degree, and paper author-interaction degree; "
"straight-line fits confirm approximate power-law tails. (d) The mass of nodes sits at "
"degree 1 — 56% of authors have a single connection — so cold-start nodes dominate and "
"motivate structural / high-order features."))
if __name__ == "__main__":
from style import ensure_dirs
r = make(Path("."), ensure_dirs(Path(".")))
print(r["key"], r["status"])