| |
| |
|
|
| import networkx as nx |
| import pandas as pd |
| from scipy.stats import pearsonr |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
| |
| from gensim.models.ldamodel import LdaModel |
| from gensim.corpora.dictionary import Dictionary |
| import nltk |
| from nltk.tokenize import word_tokenize |
|
|
|
|
| |
# Make sure both NLTK tokenizer resources are available, downloading each
# one quietly only if its lookup fails (keeps repeated runs fast/offline).
for _resource_path, _package in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, quiet=True)
|
|
|
|
| def _safe_int_year(value): |
| try: |
| return int(value) |
| except (TypeError, ValueError): |
| return 0 |
|
|
|
|
| def _rank_vector(scores, node_order): |
| """ |
| Convert centrality scores to rank vectors (1 = highest centrality), |
| which matches the assignment requirement to correlate rankings. |
| """ |
| series = pd.Series({node: scores[node] for node in node_order}) |
| ranks = series.rank(method="average", ascending=False) |
| return [float(ranks[node]) for node in node_order] |
|
|
|
|
def _tokenize(text):
    """Lower-case and tokenize *text*, keeping alphabetic tokens longer than 2 chars."""
    def _keep(token):
        return token.isalpha() and len(token) > 2

    return list(filter(_keep, word_tokenize(text.lower())))
|
|
|
|
| |
def weaktie_analysis(LCC):
    """
    Granovetter-style weak/strong tie experiment on the largest connected
    component.

    Ties are ordered by edge ``weight`` (missing weights treated as 0.0) and
    removed one at a time — weakest-first vs strongest-first — recomputing
    the largest-component size after every removal. The two decay curves are
    then plotted with matplotlib.

    Parameters
    ----------
    LCC : networkx.Graph
        The largest connected component; edges may carry a ``weight``
        attribute (string or numeric — coerced via float()).
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    # Sort ties weakest-to-strongest; GraphML often stores weights as
    # strings, hence the float() coercion in the sort key.
    edges_asc = sorted(
        LCC.edges(data=True), key=lambda x: float(x[2].get("weight", 0.0))
    )
    edges_desc = list(reversed(edges_asc))
    edge_weights = [float(data.get("weight", 0.0)) for _, _, data in edges_asc]
    total_edges = len(edge_weights)

    # Edge case: an LCC with no edges (single node) has nothing to remove.
    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # The median weight splits ties into "weak" (<= median) and
    # "strong" (> median) purely for the summary counts printed below; the
    # removal experiment itself uses the full sorted orderings.
    median_weight = float(np.median(edge_weights))
    weak_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) <= median_weight]
    strong_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) > median_weight]

    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def get_lcc_sizes_by_single_removal(edge_list):
        # Remove edges in the given order from a copy of LCC, recording the
        # fraction removed and the largest-component size after each step.
        temp_graph = LCC.copy()
        total_edges = len(edge_list)  # intentionally shadows the outer total_edges
        fractions_removed = [0.0]
        lcc_sizes = [len(max(nx.connected_components(temp_graph), key=len))]

        for idx, (u, v, _) in enumerate(edge_list, start=1):
            if temp_graph.has_edge(u, v):
                temp_graph.remove_edge(u, v)

            # Edge removal never deletes nodes, so this branch is effectively
            # always taken; the else is a defensive fallback.
            if temp_graph.number_of_nodes() > 0:
                current_lcc = max(nx.connected_components(temp_graph), key=len)
                lcc_sizes.append(len(current_lcc))
            else:
                lcc_sizes.append(0)
            fractions_removed.append(idx / total_edges)

        return fractions_removed, lcc_sizes

    x_weak, y_weak = get_lcc_sizes_by_single_removal(edges_asc)
    x_strong, y_strong = get_lcc_sizes_by_single_removal(edges_desc)

    # Overlay the two decay curves: a steeper collapse when removing weak
    # ties first is the classic signature of weak ties bridging the network.
    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
|
|
|
|
| |
def centrality_analysis(LCC):
    """
    Compare degree, closeness, and betweenness centrality on the LCC.

    Computes the three centralities, correlates their *rankings* (Pearson on
    rank vectors, i.e. a Spearman-style correlation), prints a 3x3
    correlation table, the top-10 papers per measure, and any papers that
    appear in more than one top-10 list.

    Parameters
    ----------
    LCC : networkx.Graph
        The largest connected component; nodes may carry a ``title``
        attribute used in the printed listings.
    """
    print("\n--- Starting Centrality Analysis ---")

    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)

    # Fix one node order so the three rank vectors are aligned element-wise.
    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    # Pairwise Pearson correlations of the rank vectors (p-values discarded).
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    # Symmetric correlation matrix rendered as a small DataFrame.
    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # Identify the most- and least-agreeing pairs of measures.
    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    # Canned interpretations keyed by the *unordered* metric pair, so lookup
    # works regardless of which order the pair tuple stores the names in.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    # Top-10 listings per measure; remember which nodes appeared where.
    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Invert the mapping: node -> list of measures whose top-10 contains it.
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            if node_id not in top_presence:
                top_presence[node_id] = []
            top_presence[node_id].append(metric_name)

    # Keep only nodes present in >= 2 top-10 lists; sort by breadth then ID.
    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))

    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
|
|
|
|
| |
def research_evolution_analysis(G, num_topics=5):
    """
    Optional extra-credit analysis: compare LDA topics fitted on papers
    published before 2023 against papers from 2023 onward, and report for
    each topic its shift score (1 - best cosine match in the other period).

    Parameters
    ----------
    G : networkx.Graph
        Full graph; node attributes ``title``, ``abstract`` and ``year``
        are read (missing values tolerated).
    num_topics : int, optional
        Number of LDA topics fitted independently to each period.
    """
    print("\n--- Optional: Research Evolution Analysis ---")

    def _collect_docs(node_ids):
        # One token list per node, built from title + abstract; nodes whose
        # text tokenizes to nothing are dropped.
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    # Split papers at the 2023 boundary (unparseable years coerce to 0,
    # i.e. they land in the "before" period).
    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]

    before_docs = _collect_docs(before_nodes)
    after_docs = _collect_docs(after_nodes)

    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # Shared vocabulary so the two models' topic-word vectors are comparable.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return

    # Bag-of-words corpora; drop documents emptied by vocabulary filtering.
    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]

    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    # Fixed random_state keeps the topic fits reproducible across runs.
    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )

    # Topic-word weight matrices, rows = topics, columns = vocabulary terms.
    D = lda_before.get_topics()
    S = lda_after.get_topics()
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        # Zero-norm guard: treat an all-zero topic vector as dissimilar.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # Shift of a topic = 1 - similarity to its best match in the other
    # period; high shift = the theme has no close counterpart.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, 1.0 - max(sims) if sims else 1.0))

    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, 1.0 - max(sims) if sims else 1.0))

    # Report the most-shifted topics first.
    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        # Highest-weight vocabulary terms for a topic, as a readable string.
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")

    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
|
|
|
|
def main():
    """
    Load the ACL bibliography GraphML file, extract its largest connected
    component, and run the tie, centrality, and topic-evolution analyses.
    """
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        # Missing/corrupt file: report and bail out instead of crashing.
        print(f"Error loading graph file: {e}")
        return

    # Guard against an empty graph: max() over zero connected components
    # would raise ValueError below.
    if G.number_of_nodes() == 0:
        print("Loaded graph is empty; nothing to analyze.")
        return

    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )

    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    # The evolution analysis uses the full graph, not just the LCC, so
    # isolated papers still contribute to the topic models.
    research_evolution_analysis(G)
|
|
|
|
# Script entry point: run the full analysis pipeline when executed directly.
if __name__ == "__main__":
    main()