File size: 2,464 Bytes
f28d994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""Fig 1: dataset overview — degree distributions (log-log CCDF + power-law fit).

Reads the raw edge files and plots complementary CDFs for co-author degree,
paper read-degree, and paper in-citation degree. Degrees are computed with
numpy/pandas only; plotting uses matplotlib.
"""
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.insert(0, str(Path(__file__).resolve().parent))
from plot_style import apply, save, PALETTE_DEEP as C  # noqa: E402

# Activate the shared plotting style before any figure is created.
apply()

# Project layout: the root is two levels above the directory holding this script.
ROOT = Path(__file__).resolve().parents[2]
DD = ROOT / "data_and_docs"
FIG = ROOT / "reports" / "figures"

# All three edge files are parsed the same way: whitespace-delimited, no header
# row, so pandas labels the columns 0, 1, ...
_EDGE_FORMAT = {"sep": r"\s+", "header": None}
co = pd.read_csv(DD / "author_file_ann.txt", **_EDGE_FORMAT)
cite = pd.read_csv(DD / "paper_file_ann.txt", **_EDGE_FORMAT)
read = pd.read_csv(DD / "bipartite_train_ann.txt", **_EDGE_FORMAT)

# Co-author edges are treated as undirected: both endpoint columns are pooled,
# so every edge adds one to the degree of each of its two authors.
_co_endpoints = np.concatenate((co[0].to_numpy(), co[1].to_numpy()))
co_deg = np.bincount(_co_endpoints)

# Paper read-degree: occurrences of each id in column 1 of the bipartite file
# (presumably the paper column -- confirm against the data documentation).
read_deg = np.bincount(read[1].to_numpy())

# Paper in-degree: occurrences of each id in column 1 of the citation file
# (presumably the cited paper -- confirm against the data documentation).
cite_in_deg = np.bincount(cite[1].to_numpy())


def plot_ccdf(deg, ax, color, title):
    """Draw the empirical degree CCDF on log-log axes with a straight-line fit.

    Entries that are not strictly positive are dropped, then P(degree >= k) is
    evaluated at every distinct degree k and plotted as points. When at least
    two distinct degrees exist, a least-squares line is fitted to
    log10(CCDF) versus log10(k) and overlaid, with its slope in the legend.
    With fewer than two distinct degrees the fit and legend are skipped instead
    of raising or reporting a meaningless slope.

    Args:
        deg: Array-like of per-node degrees (e.g. the output of
            ``np.bincount``); entries <= 0 are ignored.
        ax: Axes-like object to draw on; must provide ``loglog``,
            ``set_title``, ``set_xlabel``, ``set_ylabel`` and ``legend``.
        color: Colour used for both the points and the fitted line.
        title: Panel title.

    Returns:
        None. ``ax`` is modified in place.
    """
    deg = np.asarray(deg)
    # np.unique already returns the distinct values in ascending order, so no
    # separate sort is needed.
    uniq, cnt = np.unique(deg[deg > 0], return_counts=True)

    if uniq.size:
        # Reverse cumulative sum: number of nodes whose degree is >= each k.
        at_least = cnt[::-1].cumsum()[::-1]
        ccdf = at_least / cnt.sum()
        ax.loglog(uniq, ccdf, ".", color=color, markersize=4)

        # A line needs two points: np.polyfit raises on empty input and is
        # rank-deficient on a single point.
        if uniq.size >= 2:
            # power-law fit on the log-log CCDF
            x = np.log10(uniq)
            y = np.log10(ccdf)
            slope, intercept = np.polyfit(x, y, 1)
            xs = np.linspace(x.min(), x.max(), 50)
            ax.loglog(10 ** xs, 10 ** (slope * xs + intercept), "-", color=color, lw=1.5,
                      label=f"slope={slope:.2f}")
            # Only request a legend when a labelled artist actually exists.
            ax.legend(fontsize=9)

    ax.set_title(title)
    ax.set_xlabel("degree k")
    ax.set_ylabel("P(degree ≥ k)")


# One row of three panels, one per degree distribution.
fig, axes = plt.subplots(1, 3, figsize=(15, 4.6))
panels = (
    (co_deg, C[0], "(a) Co-author degree"),
    (read_deg, C[2], "(b) Paper read-degree"),
    (cite_in_deg, C[3], "(c) Paper in-citation degree"),
)
for panel_ax, (panel_deg, panel_color, panel_title) in zip(axes, panels):
    plot_ccdf(panel_deg, panel_ax, panel_color, panel_title)

# Caption placed just below the axes area (y is slightly negative in figure
# coordinates).
# NOTE(review): these summary numbers are hard-coded rather than derived from
# the loaded edge files -- confirm they still match the dataset.
stats = ("6,611 authors / 79,937 papers\n"
         "co-author edges: 9,663 | citations: 327,113\n"
         "train read edges: 682,421 | test pairs: 2,047,262\n"
         "bipartite density: 1.29e-3 | 56% of authors have degree 1")
caption_style = dict(ha="center", fontsize=9, color="dimgray")
fig.text(0.5, -0.04, stats, **caption_style)

fig.suptitle(
    "Dataset overview: heavy-tailed degree distributions (power-law CCDF)",
    y=1.02,
)

# Hand the figure to the shared save helper, then report completion on stdout.
save(fig, "fig1_dataset_overview", FIG)
print("saved fig1_dataset_overview")