# Author: Md. Shahidul Salim
# Date: February 12, 2026

import networkx as nx
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt

# Extra credit imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK resources are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Required by some NLTK versions.
    nltk.download("punkt_tab", quiet=True)


def _safe_int_year(value):
    """Coerce a node's ``year`` attribute to int, returning 0 when missing/invalid."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _rank_vector(scores, node_order):
    """
    Convert centrality scores to rank vectors (1 = highest centrality), which
    matches the assignment requirement to correlate rankings.

    Ties receive the average rank so the vectors are suitable for Pearson
    correlation over rankings (i.e. a Spearman-style comparison).
    """
    series = pd.Series({node: scores[node] for node in node_order})
    ranks = series.rank(method="average", ascending=False)
    return [float(ranks[node]) for node in node_order]


def _tokenize(text):
    """Lowercase, word-tokenize, and keep alphabetic tokens longer than 2 chars."""
    tokens = word_tokenize(text.lower())
    return [tok for tok in tokens if tok.isalpha() and len(tok) > 2]


# Part 1: Weak Tie Analysis
def weaktie_analysis(LCC):
    """
    Compare the impact of removing weak vs. strong ties on the size of the
    largest connected component (LCC).

    Ties are split at the median edge weight (weak: weight <= median,
    strong: weight > median). Edges are then removed one at a time — weakest
    first vs. strongest first — and the LCC size is recomputed after every
    removal; both trajectories are plotted.
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    edges_asc = sorted(
        LCC.edges(data=True), key=lambda x: float(x[2].get("weight", 0.0))
    )
    edges_desc = list(reversed(edges_asc))
    edge_weights = [float(data.get("weight", 0.0)) for _, _, data in edges_asc]
    total_edges = len(edge_weights)
    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # Use median edge weight as the cutoff:
    # weak ties: weight <= median, strong ties: weight > median.
    median_weight = float(np.median(edge_weights))
    weak_ties = [
        (u, v, d) for u, v, d in edges_asc
        if float(d.get("weight", 0.0)) <= median_weight
    ]
    strong_ties = [
        (u, v, d) for u, v, d in edges_asc
        if float(d.get("weight", 0.0)) > median_weight
    ]
    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def get_lcc_sizes_by_single_removal(edge_list):
        """Remove edges in order; return (fraction removed, LCC size) trajectories."""
        temp_graph = LCC.copy()
        # Local count, deliberately distinct from the enclosing total_edges.
        num_edges = len(edge_list)
        fractions_removed = [0.0]
        lcc_sizes = [len(max(nx.connected_components(temp_graph), key=len))]
        for idx, (u, v, _) in enumerate(edge_list, start=1):
            if temp_graph.has_edge(u, v):
                temp_graph.remove_edge(u, v)
            # Nodes are never removed, so this stays non-empty in practice;
            # the guard keeps the function safe for degenerate inputs.
            if temp_graph.number_of_nodes() > 0:
                current_lcc = max(nx.connected_components(temp_graph), key=len)
                lcc_sizes.append(len(current_lcc))
            else:
                lcc_sizes.append(0)
            fractions_removed.append(idx / num_edges)
        return fractions_removed, lcc_sizes

    x_weak, y_weak = get_lcc_sizes_by_single_removal(edges_asc)
    x_strong, y_strong = get_lcc_sizes_by_single_removal(edges_desc)

    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()


# Part 2: Centrality Analysis
def centrality_analysis(LCC):
    """
    Compute degree, closeness, and betweenness centrality on the LCC, report
    pairwise Pearson correlations between the centrality *rankings*, print the
    top-10 papers per metric, and highlight papers that appear in multiple
    top-10 lists.
    """
    print("\n--- Starting Centrality Analysis ---")
    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)

    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    # frozenset keys make the lookup order-independent for the pair tuples.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        # Fix: header now uses tab separators to match the tab-delimited rows.
        print(f"\nTop 10 Papers for {metric_name} (ID\tTitle\tScore):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Identify papers that appear in multiple top-10 lists (robust centrality evidence).
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            if node_id not in top_presence:
                top_presence[node_id] = []
            top_presence[node_id].append(metric_name)
    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    # Most widely repeated papers first; ties broken by node id for determinism.
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))
    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")


# Part 3: Research Evolution (Optional Extra Credit)
def research_evolution_analysis(G, num_topics=5):
    """
    Contrast research themes before vs. from 2023 onward via two LDA models
    trained over a shared vocabulary.

    Documents are built from each node's title + abstract. For each "before"
    topic, the shift score is 1 - max cosine similarity to any "after" topic
    (candidate disappearing theme), and symmetrically for "after" topics
    (candidate emerging theme).

    Parameters:
        G: graph whose nodes carry 'year', 'title', and 'abstract' attributes.
        num_topics: number of LDA topics per period (default 5).
    """
    print("\n--- Optional: Research Evolution Analysis ---")
    before_nodes = [
        n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023
    ]
    after_nodes = [
        n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023
    ]

    def _collect_docs(node_ids):
        """Tokenize title+abstract for each node, keeping non-empty documents."""
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            # _tokenize("") returns [], so no separate empty-text branch needed.
            tokens = _tokenize(text)
            if tokens:
                docs.append(tokens)
        return docs

    before_docs = _collect_docs(before_nodes)
    after_docs = _collect_docs(after_nodes)
    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # A shared dictionary gives both topic-term matrices one global vocabulary,
    # so their columns align and cosine similarity between rows is meaningful.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return
    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]
    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    lda_before = LdaModel(
        corpus=before_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        random_state=42,
    )
    lda_after = LdaModel(
        corpus=after_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        random_state=42,
    )

    # D and S correspond to topic-term probability matrices with shared vocabulary.
    D = lda_before.get_topics()  # shape: (k1, n)
    S = lda_after.get_topics()   # shape: (k2, n)
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        """Cosine similarity of two vectors; 0.0 when either has zero norm."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # Shift score: 1 - best match in the other period's topics.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, 1.0 - max(sims) if sims else 1.0))
    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, 1.0 - max(sims) if sims else 1.0))
    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        """Return the topn highest-probability words of a topic as a string."""
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")
    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")


def main():
    """Load the citation network, then run the three analyses on it."""
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        print(f"Error loading graph file: {e}")
        return
    # Fix: connected_components is undefined for directed graphs; GraphML may
    # deserialize as a DiGraph, so normalize to undirected first.
    if G.is_directed():
        G = G.to_undirected()
    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )
    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)


if __name__ == "__main__":
    main()