cs3319-project2 / code /figures /fig1_dataset_overview.py
NLP-beginner's picture
CS3319 Project 2 final deliverable (public F1 = 0.96626)
f28d994
Raw
History Blame Contribute Delete
2.46 kB
"""Fig 1: dataset overview — degree distributions (log-log CCDF + power-law fit).
Reads the raw edge files and plots complementary CDFs for co-author degree,
paper read-degree, and paper in-citation degree. Pure numpy/pandas.
"""
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
sys.path.insert(0, str(Path(__file__).resolve().parent))
from plot_style import apply, save, PALETTE_DEEP as C # noqa: E402
# Switch on the shared plotting style before any figure is created.
apply()

# Directory layout: the repository root is three levels above this file.
ROOT = Path(__file__).resolve().parent.parent.parent
FIG = ROOT.joinpath("reports", "figures")
DD = ROOT.joinpath("data_and_docs")

# Edge lists: whitespace-separated, no header row, so pandas labels the
# columns 0, 1, ...
co = pd.read_csv(DD / "author_file_ann.txt", header=None, sep=r"\s+")
cite = pd.read_csv(DD / "paper_file_ann.txt", header=None, sep=r"\s+")
read = pd.read_csv(DD / "bipartite_train_ann.txt", header=None, sep=r"\s+")

# Degree vectors, indexed by node id (entry i counts the occurrences of id i).
# Co-author edges are treated as undirected: both endpoint columns count.
co_deg = np.bincount(co[[0, 1]].to_numpy().ravel())
# Paper read-degree: occurrences of each id in column 1 of the read edges.
read_deg = np.bincount(read[1].to_numpy())
# Paper in-degree: occurrences of each id in column 1 of the citation edges.
cite_in_deg = np.bincount(cite[1].to_numpy())
def plot_ccdf(deg, ax, color, title):
    """Plot the empirical CCDF of a degree sequence on log-log axes with a line fit.

    Entries that are not strictly positive are dropped, P(degree >= k) is
    evaluated at every distinct remaining degree k, and a least-squares
    straight line is fitted to the points in log10-log10 space. The fitted
    slope is reported in the legend.

    Args:
        deg: Array-like of degree values, one entry per node (the callers in
            this file pass ``np.bincount`` output). Entries <= 0 are ignored.
        ax: Axes-like object to draw on; ``loglog``, ``set_title``,
            ``set_xlabel``, ``set_ylabel`` and ``legend`` are called on it.
        color: Colour used for both the markers and the fitted line.
        title: Panel title.

    Returns:
        None. ``ax`` is modified in place.

    Notes:
        A line needs at least two points. With no positive degree nothing is
        drawn, and with a single distinct degree only the marker is drawn; in
        both cases the fit and the legend are skipped instead of raising or
        reporting a meaningless slope.
    """
    degrees = np.asarray(deg)
    positive = degrees[degrees > 0]
    # np.unique already returns the distinct values in ascending order, so no
    # separate sort is required.
    ks, counts = np.unique(positive, return_counts=True)

    if ks.size:
        # Reverse cumulative sum: number of nodes whose degree is >= k.
        at_least_k = counts[::-1].cumsum()[::-1]
        ccdf = at_least_k / positive.size
        ax.loglog(ks, ccdf, ".", color=color, markersize=4)

    has_fit = ks.size >= 2
    if has_fit:
        # Straight-line fit on the log-log CCDF.
        log_k = np.log10(ks)
        log_p = np.log10(ccdf)
        slope, intercept = np.polyfit(log_k, log_p, 1)
        xs = np.linspace(log_k.min(), log_k.max(), 50)
        ax.loglog(10 ** xs, 10 ** (slope * xs + intercept), "-", color=color,
                  lw=1.5, label=f"slope={slope:.2f}")

    ax.set_title(title)
    ax.set_xlabel("degree k")
    ax.set_ylabel("P(degree ≥ k)")
    if has_fit:
        # Only request a legend when a labelled artist exists.
        ax.legend(fontsize=9)
# One row of three panels, one per degree distribution.
fig, axes = plt.subplots(1, 3, figsize=(15, 4.6))

# (degree vector, palette colour, panel title) for each panel, left to right.
panels = (
    (co_deg, C[0], "(a) Co-author degree"),
    (read_deg, C[2], "(b) Paper read-degree"),
    (cite_in_deg, C[3], "(c) Paper in-citation degree"),
)
for ax, (deg, color, title) in zip(axes, panels):
    plot_ccdf(deg, ax, color, title)

# Caption with summary statistics. These figures are hard-coded here, not
# recomputed from the loaded edge files.
stats = ("6,611 authors / 79,937 papers\n"
         "co-author edges: 9,663 | citations: 327,113\n"
         "train read edges: 682,421 | test pairs: 2,047,262\n"
         "bipartite density: 1.29e-3 | 56% of authors have degree 1")
fig.text(0.5, -0.04, stats, ha="center", fontsize=9, color="dimgray")
fig.suptitle(
    "Dataset overview: heavy-tailed degree distributions (power-law CCDF)",
    y=1.02,
)

# Write the figure through the shared helper, then report completion.
save(fig, "fig1_dataset_overview", FIG)
print("saved fig1_dataset_overview")