# shahidul034's picture
# Add files using upload-large-folder tool
# 1db7196 verified
# Author: Md. Shahidul Salim
# Date: February 12, 2026
import networkx as nx
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt
# Extra credit imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import nltk
from nltk.tokenize import word_tokenize
# Ensure NLTK resources are available: probe each data path and download
# the matching package only when it is missing.
# ("punkt_tab" is required by some NLTK versions.)
for _data_path, _package in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_data_path)
    except LookupError:
        nltk.download(_package, quiet=True)
def _safe_int_year(value):
try:
return int(value)
except (TypeError, ValueError):
return 0
def _rank_vector(scores, node_order):
"""
Convert centrality scores to rank vectors (1 = highest centrality),
which matches the assignment requirement to correlate rankings.
"""
series = pd.Series({node: scores[node] for node in node_order})
ranks = series.rank(method="average", ascending=False)
return [float(ranks[node]) for node in node_order]
def _tokenize(text):
    """Lowercase *text* and keep alphabetic tokens longer than two chars."""
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and len(token) > 2
    ]
# Part 1: Weak Tie Analysis
def weaktie_analysis(LCC):
    """
    Compare how removing weak ties vs. strong ties fragments the LCC.

    Ties are ordered by edge weight; one tie is removed per step and the
    size of the largest connected component is recorded after each removal.
    Results are reported via print statements and a matplotlib plot.
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    # Weakest-first ordering; strongest-first is simply the reverse.
    weakest_first = sorted(
        LCC.edges(data=True), key=lambda edge: float(edge[2].get("weight", 0.0))
    )
    strongest_first = weakest_first[::-1]
    weights = [float(attrs.get("weight", 0.0)) for _, _, attrs in weakest_first]
    n_ties = len(weights)
    if n_ties == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return
    # Use median edge weight as the cutoff:
    # weak ties: weight <= median, strong ties: weight > median.
    median_weight = float(np.median(weights))
    n_weak = sum(1 for w in weights if w <= median_weight)
    n_strong = n_ties - n_weak
    print(f"Total ties in LCC: {n_ties}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {n_weak}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {n_strong}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def lcc_trajectory(ordered_edges):
        # Delete ties in the given order, tracking (fraction removed, LCC size).
        working = LCC.copy()
        fractions = [0.0]
        sizes = [len(max(nx.connected_components(working), key=len))]
        for step, (u, v, _) in enumerate(ordered_edges, start=1):
            if working.has_edge(u, v):
                working.remove_edge(u, v)
            if working.number_of_nodes() > 0:
                sizes.append(len(max(nx.connected_components(working), key=len)))
            else:
                sizes.append(0)
            fractions.append(step / len(ordered_edges))
        return fractions, sizes

    x_weak, y_weak = lcc_trajectory(weakest_first)
    x_strong, y_strong = lcc_trajectory(strongest_first)
    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
# Part 2: Centrality Analysis
def centrality_analysis(LCC):
    """
    Correlate degree/closeness/betweenness rankings on the LCC and report
    the top-10 papers per measure plus papers recurring across measures.

    Args:
        LCC: networkx graph (largest connected component); nodes may carry
            a "title" attribute used for display.

    Side effects:
        Prints a Pearson correlation table of the three ranking vectors,
        the lowest/highest-correlation pair with an interpretation, the
        per-metric top-10 paper lists, and papers repeated across lists.
    """
    print("\n--- Starting Centrality Analysis ---")
    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)
    nodes = list(LCC.nodes())
    # Pearson on rank vectors is equivalent to Spearman on raw scores,
    # which matches the assignment's "correlate rankings" requirement.
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)
    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))
    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )
    # frozenset keys make the lookup order-insensitive for the pair tuple.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )
    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")
    # Identify papers that appear in multiple top-10 lists (robust centrality evidence).
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            # setdefault replaces the manual "if key missing, insert list" dance.
            top_presence.setdefault(node_id, []).append(metric_name)
    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    # Most widely repeated papers first; break ties deterministically by ID.
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))
    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
# Part 3: Research Evolution (Optional Extra Credit)
def _tokenized_docs(G, node_ids):
    """Build one token list per node from its title + abstract; drop empty docs."""
    docs = []
    for n in node_ids:
        title = G.nodes[n].get("title", "")
        abstract = G.nodes[n].get("abstract", "")
        text = f"{title} {abstract}".strip()
        tokens = _tokenize(text) if text else []
        if tokens:
            docs.append(tokens)
    return docs


def research_evolution_analysis(G, num_topics=5):
    """
    Contrast LDA topics trained on pre-2023 vs. 2023-and-later papers to
    surface potentially disappearing and emerging research themes.

    Args:
        G: networkx graph whose nodes may carry "year", "title", "abstract".
        num_topics: number of LDA topics per period (default 5).

    Side effects:
        Prints D/S topic-term matrix shapes and, per topic, a shift score
        (1 - max cosine similarity to any topic of the other period) with
        its top words.
    """
    print("\n--- Optional: Research Evolution Analysis ---")
    # Split the corpus at the year-2023 boundary; unparseable years become 0
    # via _safe_int_year and therefore land in the "before" partition.
    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]
    before_docs = _tokenized_docs(G, before_nodes)
    after_docs = _tokenized_docs(G, after_nodes)
    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return
    # A shared dictionary gives one global vocabulary of size n for both
    # topic-term matrices, so their rows are directly comparable.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return
    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    # Drop documents whose every token was filtered out of the vocabulary.
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]
    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return
    # Fixed random_state keeps the topic decomposition reproducible.
    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    # D and S correspond to topic-term probability matrices with shared vocabulary.
    D = lda_before.get_topics()  # shape: (k1, n)
    S = lda_after.get_topics()  # shape: (k2, n)
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        # Guard against zero vectors to avoid a division-by-zero NaN.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # Shift score per topic: 1 - similarity to its best match in the other
    # period. High shift => the theme has no close counterpart.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, 1.0 - max(sims) if sims else 1.0))
    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, 1.0 - max(sims) if sims else 1.0))
    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        # Highest-probability words summarize the topic for display.
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")
    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
def main():
    """Load the citation graph, extract its LCC, and run all three analyses."""
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        print(f"Error loading graph file: {e}")
        return
    # Work on a copy of the largest connected component so later edge
    # removals never mutate the full graph.
    largest_component = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(largest_component).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )
    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)


if __name__ == "__main__":
    main()