| |
| |
|
|
| import networkx as nx |
| import pandas as pd |
| from scipy.stats import pearsonr |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
| |
| from gensim.models.ldamodel import LdaModel |
| from gensim.corpora.dictionary import Dictionary |
| import nltk |
| from nltk.tokenize import word_tokenize |
|
|
|
|
| |
# Make sure both NLTK tokenizer resources are available, downloading each
# one quietly only if its lookup fails (keeps repeated runs fast/offline).
for _resource_path, _package in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, quiet=True)
|
|
|
|
| def _safe_int_year(value): |
| try: |
| return int(value) |
| except (TypeError, ValueError): |
| return 0 |
|
|
|
|
| def _rank_vector(scores, node_order): |
| """ |
| Convert centrality scores to rank vectors (1 = highest centrality), |
| which matches the assignment requirement to correlate rankings. |
| """ |
| series = pd.Series({node: scores[node] for node in node_order}) |
| ranks = series.rank(method="average", ascending=False) |
| return [float(ranks[node]) for node in node_order] |
|
|
|
|
def _tokenize(text):
    """Lower-case and tokenize *text*, keeping alphabetic tokens longer than 2 chars."""
    def _keep(token):
        return token.isalpha() and len(token) > 2

    return list(filter(_keep, word_tokenize(text.lower())))
|
|
|
|
| |
def weaktie_analysis(LCC):
    """
    Granovetter-style weak/strong tie experiment on the largest connected
    component.

    Ties are ordered by edge ``weight`` (missing weights treated as 0.0) and
    removed one at a time — weakest-first vs strongest-first — recomputing
    the largest-component size after every removal. The two decay curves are
    then plotted with matplotlib.

    Parameters
    ----------
    LCC : networkx.Graph
        The largest connected component; edges may carry a ``weight``
        attribute (string or numeric — coerced via float()).
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    # Sort ties weakest-to-strongest; GraphML often stores weights as
    # strings, hence the float() coercion in the sort key.
    edges_asc = sorted(
        LCC.edges(data=True), key=lambda x: float(x[2].get("weight", 0.0))
    )
    edges_desc = list(reversed(edges_asc))
    edge_weights = [float(data.get("weight", 0.0)) for _, _, data in edges_asc]
    total_edges = len(edge_weights)

    # Edge case: an LCC with no edges (single node) has nothing to remove.
    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # The median weight splits ties into "weak" (<= median) and
    # "strong" (> median) purely for the summary counts printed below; the
    # removal experiment itself uses the full sorted orderings.
    median_weight = float(np.median(edge_weights))
    weak_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) <= median_weight]
    strong_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) > median_weight]

    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def get_lcc_sizes_by_single_removal(edge_list):
        # Remove edges in the given order from a copy of LCC, recording the
        # fraction removed and the largest-component size after each step.
        temp_graph = LCC.copy()
        total_edges = len(edge_list)  # intentionally shadows the outer total_edges
        fractions_removed = [0.0]
        lcc_sizes = [len(max(nx.connected_components(temp_graph), key=len))]

        for idx, (u, v, _) in enumerate(edge_list, start=1):
            if temp_graph.has_edge(u, v):
                temp_graph.remove_edge(u, v)

            # Edge removal never deletes nodes, so this branch is effectively
            # always taken; the else is a defensive fallback.
            if temp_graph.number_of_nodes() > 0:
                current_lcc = max(nx.connected_components(temp_graph), key=len)
                lcc_sizes.append(len(current_lcc))
            else:
                lcc_sizes.append(0)
            fractions_removed.append(idx / total_edges)

        return fractions_removed, lcc_sizes

    x_weak, y_weak = get_lcc_sizes_by_single_removal(edges_asc)
    x_strong, y_strong = get_lcc_sizes_by_single_removal(edges_desc)

    # Overlay the two decay curves: a steeper collapse when removing weak
    # ties first is the classic signature of weak ties bridging the network.
    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
|
|
|
|
| |
def centrality_analysis(LCC):
    """
    Compare degree, closeness, and betweenness centrality on the LCC.

    Computes the three centralities, correlates their *rankings* (Pearson on
    rank vectors, i.e. a Spearman-style correlation), prints a 3x3
    correlation table, the top-10 papers per measure, and any papers that
    appear in more than one top-10 list.

    Parameters
    ----------
    LCC : networkx.Graph
        The largest connected component; nodes may carry a ``title``
        attribute used in the printed listings.
    """
    print("\n--- Starting Centrality Analysis ---")

    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)

    # Fix one node order so the three rank vectors are aligned element-wise.
    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    # Pairwise Pearson correlations of the rank vectors (p-values discarded).
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    # Symmetric correlation matrix rendered as a small DataFrame.
    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Identify the most- and least-agreeing pairs of measures.
    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    # Canned interpretations keyed by the *unordered* metric pair, so lookup
    # works regardless of which order the pair tuple stores the names in.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    # Top-10 listings per measure; remember which nodes appeared where.
    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Invert the mapping: node -> list of measures whose top-10 contains it.
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            if node_id not in top_presence:
                top_presence[node_id] = []
            top_presence[node_id].append(metric_name)

    # Keep only nodes present in >= 2 top-10 lists; sort by breadth then ID.
    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))

    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
|
|
|
|
| |
def research_evolution_analysis(G, num_topics=5):
    """
    Optional extra-credit analysis: compare LDA topics fitted on papers
    published before 2023 against papers from 2023 onward, and report for
    each topic its shift score (1 - best cosine match in the other period).

    Parameters
    ----------
    G : networkx.Graph
        Full graph; node attributes ``title``, ``abstract`` and ``year``
        are read (missing values tolerated).
    num_topics : int, optional
        Number of LDA topics fitted independently to each period.
    """
    print("\n--- Optional: Research Evolution Analysis ---")

    def _collect_docs(node_ids):
        # One token list per node, built from title + abstract; nodes whose
        # text tokenizes to nothing are dropped.
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    # Split papers at the 2023 boundary (unparseable years coerce to 0,
    # i.e. they land in the "before" period).
    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]

    before_docs = _collect_docs(before_nodes)
    after_docs = _collect_docs(after_nodes)

    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # Shared vocabulary so the two models' topic-word vectors are comparable.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return

    # Bag-of-words corpora; drop documents emptied by vocabulary filtering.
    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]

    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    # Fixed random_state keeps the topic fits reproducible across runs.
    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )

    # Topic-word weight matrices, rows = topics, columns = vocabulary terms.
    D = lda_before.get_topics()
    S = lda_after.get_topics()
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        # Zero-norm guard: treat an all-zero topic vector as dissimilar.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # Shift of a topic = 1 - similarity to its best match in the other
    # period; high shift = the theme has no close counterpart.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, 1.0 - max(sims) if sims else 1.0))

    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, 1.0 - max(sims) if sims else 1.0))

    # Report the most-shifted topics first.
    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        # Highest-weight vocabulary terms for a topic, as a readable string.
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")

    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
|
|
|
|
def main():
    """
    Load the ACL bibliography GraphML file, extract its largest connected
    component, and run the tie, centrality, and topic-evolution analyses.
    """
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        # Missing/corrupt file: report and bail out instead of crashing.
        print(f"Error loading graph file: {e}")
        return

    # Guard against an empty graph: max() over zero connected components
    # would raise ValueError below.
    if G.number_of_nodes() == 0:
        print("Loaded graph is empty; nothing to analyze.")
        return

    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )

    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    # The evolution analysis uses the full graph, not just the LCC, so
    # isolated papers still contribute to the topic models.
    research_evolution_analysis(G)
|
|
|
|
# Script entry point: run the full analysis pipeline when executed directly.
if __name__ == "__main__":
    main()