| | |
| | |
| |
|
| | import networkx as nx |
| | import pandas as pd |
| | from scipy.stats import pearsonr |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| |
|
| | |
| | from gensim.models.ldamodel import LdaModel |
| | from gensim.corpora.dictionary import Dictionary |
| | import nltk |
| | from nltk.tokenize import word_tokenize |
| |
|
| |
|
| | |
# Make sure the NLTK tokenizer models are present before any tokenization.
# Newer NLTK releases need "punkt_tab" in addition to the classic "punkt".
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)
| |
|
| |
|
| | def _safe_int_year(value): |
| | try: |
| | return int(value) |
| | except (TypeError, ValueError): |
| | return 0 |
| |
|
| |
|
| | def _rank_vector(scores, node_order): |
| | """ |
| | Convert centrality scores to rank vectors (1 = highest centrality), |
| | which matches the assignment requirement to correlate rankings. |
| | """ |
| | series = pd.Series({node: scores[node] for node in node_order}) |
| | ranks = series.rank(method="average", ascending=False) |
| | return [float(ranks[node]) for node in node_order] |
| |
|
| |
|
def _tokenize(text):
    """Lowercase and tokenize text, keeping alphabetic tokens of length > 2."""
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and len(token) > 2:
            kept.append(token)
    return kept
| |
|
| |
|
| | |
def weaktie_analysis(LCC):
    """Compare network fragmentation under weak- vs. strong-tie removal.

    Edges are ordered by weight and classified against the median weight,
    then deleted one per step (weakest-first and strongest-first). After
    every single removal the size of the largest connected component is
    recomputed, and both trajectories are plotted.
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    # Ascending by weight; a missing "weight" attribute counts as 0.0.
    # The descending order is the exact reversal of the ascending list so
    # that equal-weight edges are processed in mirrored order.
    ascending = sorted(
        LCC.edges(data=True), key=lambda e: float(e[2].get("weight", 0.0))
    )
    descending = ascending[::-1]
    weights = [float(attrs.get("weight", 0.0)) for _, _, attrs in ascending]
    total_edges = len(weights)

    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # The median weight splits ties into weak (<= median) and strong (> median).
    median_weight = float(np.median(weights))
    weak_ties = [edge for edge in ascending if float(edge[2].get("weight", 0.0)) <= median_weight]
    strong_ties = [edge for edge in ascending if float(edge[2].get("weight", 0.0)) > median_weight]

    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def lcc_trajectory(removal_order):
        # Operate on a copy so each trajectory starts from the intact LCC.
        work = LCC.copy()
        steps = len(removal_order)
        fractions = [0.0]
        sizes = [len(max(nx.connected_components(work), key=len))]
        for step, (u, v, _) in enumerate(removal_order, start=1):
            if work.has_edge(u, v):
                work.remove_edge(u, v)
            if work.number_of_nodes() > 0:
                sizes.append(len(max(nx.connected_components(work), key=len)))
            else:
                sizes.append(0)
            fractions.append(step / steps)
        return fractions, sizes

    x_weak, y_weak = lcc_trajectory(ascending)
    x_strong, y_strong = lcc_trajectory(descending)

    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
| |
|
| |
|
| | |
def centrality_analysis(LCC):
    """Rank papers in the LCC by three centrality measures and compare them.

    Computes degree, closeness, and betweenness centrality, correlates the
    resulting node rankings pairwise (Pearson on rank vectors), prints the
    top-10 papers for each measure, and lists papers that appear in more
    than one top-10 list.
    """
    print("\n--- Starting Centrality Analysis ---")

    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)

    # One fixed node order so the three rank vectors align element-wise.
    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    # Pearson on rank vectors is equivalent to Spearman rank correlation
    # of the raw centrality scores.
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    # frozenset keys make the explanation lookup order-independent, so the
    # (A, B) tuple from pair_corr matches regardless of pair ordering.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Map each top-10 node id to the list of measures that ranked it.
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            if node_id not in top_presence:
                top_presence[node_id] = []
            top_presence[node_id].append(metric_name)

    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    # Papers shared by the most measures first; ties broken by node id.
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))

    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
| |
|
| |
|
| | |
def research_evolution_analysis(G, num_topics=5):
    """Contrast LDA topics of papers published before vs. from 2023 onward.

    Builds one token list per paper from its title + abstract, trains a
    separate LDA model for each period over a shared vocabulary, and reports
    which "before" topics have no close match "after" (potentially
    disappearing themes) and vice versa (potentially emerging themes), using
    cosine similarity between topic-word distributions.

    Args:
        G: Citation graph whose nodes carry ``year``, ``title`` and
           ``abstract`` attributes (strings from GraphML).
        num_topics: Number of LDA topics to fit per period.
    """
    print("\n--- Optional: Research Evolution Analysis ---")

    # Unparseable years coerce to 0 and therefore land in the "before" bucket.
    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]

    def _docs_for(node_ids):
        # One tokenized document per node built from "<title> <abstract>";
        # nodes that yield no usable tokens are dropped.
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    before_docs = _docs_for(before_nodes)
    after_docs = _docs_for(after_nodes)

    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # Shared vocabulary so the two topic-word matrices are column-aligned
    # and directly comparable term-by-term.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return

    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    # Drop documents that became empty after vocabulary filtering.
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]

    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )

    # Topic-word matrices: one row per topic, one column per dictionary term.
    D = lda_before.get_topics()
    S = lda_after.get_topics()
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def _shift_scores(src, dst):
        # Shift = 1 - best cosine match of each src topic against all dst
        # topics; a large shift means no close counterpart in the other
        # period. Sorted with largest shift first.
        scores = []
        for i in range(src.shape[0]):
            sims = [cosine_similarity(src[i], dst[j]) for j in range(dst.shape[0])]
            scores.append((i, 1.0 - max(sims) if sims else 1.0))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    before_shift = _shift_scores(D, S)
    after_shift = _shift_scores(S, D)

    def top_words(topic_vec, topn=8):
        idx = np.argsort(topic_vec)[::-1][:topn]
        # int() guards against numpy integer keys on the gensim dictionary.
        return ", ".join(dictionary[int(i)] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")

    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
| |
|
| |
|
def main():
    """Load the citation graph, extract its LCC, and run all analyses.

    Reads ``aclbib.graphml`` from the working directory; on any load error
    the message is printed and the program exits cleanly.
    """
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        print(f"Error loading graph file: {e}")
        return

    if G.number_of_nodes() == 0:
        # max() over connected_components raises ValueError on an empty graph.
        print("Graph contains no nodes; nothing to analyze.")
        return
    if G.is_directed():
        # connected_components only supports undirected graphs; GraphML
        # files may declare edgedefault="directed".
        G = G.to_undirected()

    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )

    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)
| |
|
| |
|
# Run the full analysis pipeline only when executed as a script.
if __name__ == "__main__":
    main()