# Author: Md. Shahidul Salim
# Date: February 12, 2026
import networkx as nx
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt
# Extra credit imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import nltk
from nltk.tokenize import word_tokenize
# Ensure NLTK resources are available before any tokenization happens.
# punkt_tab is additionally required by some NLTK versions.
for _resource, _package in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package, quiet=True)
def _safe_int_year(value):
try:
return int(value)
except (TypeError, ValueError):
return 0
def _rank_vector(scores, node_order):
"""
Convert centrality scores to rank vectors (1 = highest centrality),
which matches the assignment requirement to correlate rankings.
"""
series = pd.Series({node: scores[node] for node in node_order})
ranks = series.rank(method="average", ascending=False)
return [float(ranks[node]) for node in node_order]
def _tokenize(text):
    """Lowercase *text* and keep alphabetic tokens longer than two characters."""
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and len(token) > 2:
            kept.append(token)
    return kept
# Part 1: Weak Tie Analysis
def weaktie_analysis(LCC):
    """
    Analyze how removing weak vs. strong ties fragments the LCC.

    Ties at or below the median edge weight are classified as weak. Edges are
    removed one at a time, weakest-first and strongest-first, and the size of
    the largest connected component is recorded after every removal; both
    trajectories are plotted with matplotlib.

    Parameters
    ----------
    LCC : networkx.Graph
        The largest connected component. Edges may carry a "weight"
        attribute; missing weights are treated as 0.0.
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    edges_asc = sorted(
        LCC.edges(data=True), key=lambda x: float(x[2].get("weight", 0.0))
    )
    edges_desc = list(reversed(edges_asc))
    edge_weights = [float(data.get("weight", 0.0)) for _, _, data in edges_asc]
    total_edges = len(edge_weights)
    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return
    # Use median edge weight as the cutoff:
    # weak ties: weight <= median, strong ties: weight > median.
    median_weight = float(np.median(edge_weights))
    weak_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) <= median_weight]
    strong_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) > median_weight]
    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def get_lcc_sizes_by_single_removal(edge_list):
        """Remove *edge_list* edges one by one from a copy of the LCC and
        return (fractions removed, LCC sizes), starting from the intact graph."""
        temp_graph = LCC.copy()
        # Renamed from total_edges: the original shadowed the outer variable.
        num_edges = len(edge_list)
        fractions_removed = [0.0]
        lcc_sizes = [len(max(nx.connected_components(temp_graph), key=len))]
        for idx, (u, v, _) in enumerate(edge_list, start=1):
            if temp_graph.has_edge(u, v):
                temp_graph.remove_edge(u, v)
            # Removing edges never removes nodes, so the graph stays non-empty
            # and max() over components is always well defined (the original
            # guarded an unreachable empty-graph branch here).
            current_lcc = max(nx.connected_components(temp_graph), key=len)
            lcc_sizes.append(len(current_lcc))
            fractions_removed.append(idx / num_edges)
        return fractions_removed, lcc_sizes

    x_weak, y_weak = get_lcc_sizes_by_single_removal(edges_asc)
    x_strong, y_strong = get_lcc_sizes_by_single_removal(edges_desc)
    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
# Part 2: Centrality Analysis
def centrality_analysis(LCC):
    """
    Correlate degree, closeness, and betweenness centrality *rankings* on the
    LCC, print a Pearson correlation table with interpretation, the top-10
    papers per metric, and the papers repeated across multiple top-10 lists.

    Parameters
    ----------
    LCC : networkx.Graph
        Largest connected component; nodes may carry a "title" attribute.
    """
    print("\n--- Starting Centrality Analysis ---")
    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)
    nodes = list(LCC.nodes())
    # Correlate rankings (not raw scores), per the assignment requirement.
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)
    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))
    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )
    # frozenset keys make the lookup order-insensitive for the pair tuples.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )
    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")
    # Identify papers that appear in multiple top-10 lists (robust centrality evidence).
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            # setdefault replaces the manual check-then-append idiom.
            top_presence.setdefault(node_id, []).append(metric_name)
    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    # Most-repeated first, then by node id for a deterministic order.
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))
    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
# Part 3: Research Evolution (Optional Extra Credit)
def research_evolution_analysis(G, num_topics=5):
    """
    Compare LDA topics fitted on papers published before 2023 vs. 2023 and
    later, using a shared vocabulary, and report the topics with the largest
    shift (potentially disappearing / emerging themes).

    Parameters
    ----------
    G : networkx.Graph
        Full graph; nodes may carry "year", "title", and "abstract" attributes.
    num_topics : int, optional
        Number of LDA topics fitted per period (default 5).
    """
    print("\n--- Optional: Research Evolution Analysis ---")

    def tokenized_docs(node_ids):
        """Tokenize title+abstract per node, dropping nodes with no usable tokens.
        (Extracted: the original duplicated this loop for before/after.)"""
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]
    before_docs = tokenized_docs(before_nodes)
    after_docs = tokenized_docs(after_nodes)
    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return
    # A shared dictionary gives a single global vocabulary (size n) for both
    # topic-term matrices, so topic vectors are directly comparable.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return
    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    # Drop documents that became empty after vocabulary filtering.
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]
    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return
    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    # D and S correspond to topic-term probability matrices with shared vocabulary.
    D = lda_before.get_topics()  # shape: (k1, n)
    S = lda_after.get_topics()  # shape: (k2, n)
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        """Cosine similarity, defined as 0.0 when either vector is all-zero."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # A topic's "shift" is 1 minus its best cosine match in the other period.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, (1.0 - max(sims)) if sims else 1.0))
    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, (1.0 - max(sims)) if sims else 1.0))
    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        """Return the topn highest-probability words of a topic, comma-joined."""
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")
    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
def main():
    """Load the citation graph, extract its LCC, and run all analyses."""
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        # Best-effort: report and exit rather than crash on a missing/corrupt file.
        print(f"Error loading graph file: {e}")
        return
    # Guard against an empty graph: max() over an empty component iterator
    # would raise ValueError.
    if G.number_of_nodes() == 0:
        print("Graph is empty; nothing to analyze.")
        return
    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )
    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)


if __name__ == "__main__":
    main()