# Author: Md. Shahidul Salim
# Date: February 12, 2026

import networkx as nx
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt

# Extra credit imports
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import nltk
from nltk.tokenize import word_tokenize


# Ensure NLTK resources are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)

try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Required by some NLTK versions.
    nltk.download("punkt_tab", quiet=True)


def _safe_int_year(value):
    # Year attributes read from GraphML are strings and may be float-like
    # (e.g. "2021.0"); treat anything unparseable as year 0.
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return 0


def _rank_vector(scores, node_order):
    """
    Convert centrality scores to rank vectors (1 = highest centrality),
    which matches the assignment requirement to correlate rankings.
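
    Example (hypothetical scores): {"a": 0.9, "b": 0.1, "c": 0.5} with
    node_order ["a", "b", "c"] yields ranks [1.0, 3.0, 2.0].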
    """
    series = pd.Series({node: scores[node] for node in node_order})
    ranks = series.rank(method="average", ascending=False)
    return [float(ranks[node]) for node in node_order]


def _tokenize(text):
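    # Lowercase the text, then keep purely alphabetic tokens longer than two
    # characters (drops numbers, punctuation, and very short words).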
    tokens = word_tokenize(text.lower())
    return [tok for tok in tokens if tok.isalpha() and len(tok) > 2]


# Part 1: Weak Tie Analysis
def weaktie_analysis(LCC):
    print("\n--- Starting Weak/Strong Tie Analysis ---")
    edges_asc = sorted(
        LCC.edges(data=True), key=lambda x: float(x[2].get("weight", 0.0))
    )
    edges_desc = list(reversed(edges_asc))
    edge_weights = [float(data.get("weight", 0.0)) for _, _, data in edges_asc]
    total_edges = len(edge_weights)

    if total_edges == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # Use median edge weight as the cutoff:
    # weak ties: weight <= median, strong ties: weight > median.
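    # Illustrative example with hypothetical weights [1, 1, 2, 3, 5]:
    # median = 2.0, so {1, 1, 2} are weak ties and {3, 5} are strong ties.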
    median_weight = float(np.median(edge_weights))
    weak_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) <= median_weight]
    strong_ties = [(u, v, d) for u, v, d in edges_asc if float(d.get("weight", 0.0)) > median_weight]

    print(f"Total ties in LCC: {total_edges}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {len(weak_ties)}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {len(strong_ties)}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def get_lcc_sizes_by_single_removal(edge_list):
        # Remove ties one at a time in the given order, tracking how the
        # largest connected component (LCC) shrinks after each removal.
        # Recomputing components every step is O(E * (V + E)) overall.
        temp_graph = LCC.copy()
        num_edges = len(edge_list)
        fractions_removed = [0.0]
        lcc_sizes = [len(max(nx.connected_components(temp_graph), key=len))]

        for idx, (u, v, _) in enumerate(edge_list, start=1):
            if temp_graph.has_edge(u, v):
                temp_graph.remove_edge(u, v)

            # Nodes are never removed, so at worst the graph decays into
            # singleton components; an LCC always exists.
            current_lcc = max(nx.connected_components(temp_graph), key=len)
            lcc_sizes.append(len(current_lcc))
            fractions_removed.append(idx / num_edges)

        return fractions_removed, lcc_sizes

    x_weak, y_weak = get_lcc_sizes_by_single_removal(edges_asc)
    x_strong, y_strong = get_lcc_sizes_by_single_removal(edges_desc)

    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()


# Part 2: Centrality Analysis
def centrality_analysis(LCC):
    print("\n--- Starting Centrality Analysis ---")

    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)
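    # Betweenness is by far the most expensive of the three; NetworkX uses
    # Brandes' algorithm, roughly O(V * E) on unweighted graphs.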

    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    # Pearson correlation computed on rank vectors is equivalent to
    # Spearman's rank correlation on the raw centrality scores.
    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Identify papers that appear in multiple top-10 lists (robust centrality evidence).
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            top_presence.setdefault(node_id, []).append(metric_name)

    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))

    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")


# Part 3: Research Evolution (Optional Extra Credit)
def research_evolution_analysis(G, num_topics=5):
    print("\n--- Optional: Research Evolution Analysis ---")

    # Split the corpus at 2023: "before" covers papers up to 2022, "after"
    # covers 2023 onward (nodes with unparseable years land in "before").
    before_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023]
    after_nodes = [n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023]

    def build_docs(nodes):
        # Tokenize "title + abstract" per paper, keeping only documents that
        # yield at least one usable token.
        docs = []
        for n in nodes:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    before_docs = build_docs(before_nodes)
    after_docs = build_docs(after_nodes)

    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # A shared dictionary gives both models one global vocabulary of size n,
    # so the two topic-term matrices are directly comparable.
    dictionary = Dictionary(before_docs + after_docs)
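    # filter_extremes below drops tokens that appear in fewer than 2 documents
    # or in more than 50% of them, then keeps the 5,000 most frequent terms.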
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return

    before_corpus = [dictionary.doc2bow(doc) for doc in before_docs]
    after_corpus = [dictionary.doc2bow(doc) for doc in after_docs]
    before_corpus = [bow for bow in before_corpus if bow]
    after_corpus = [bow for bow in after_corpus if bow]

    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
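    # passes=10 performs ten training sweeps over each corpus; the fixed
    # random_state keeps the learned topics reproducible across runs.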

    # D and S are the topic-term probability matrices (one row per topic,
    # rows summing to 1) over the shared vocabulary.
    D = lda_before.get_topics()  # shape: (k1, n)
    S = lda_after.get_topics()   # shape: (k2, n)
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        # cos(a, b) = (a . b) / (||a|| * ||b||); define the similarity of a
        # zero vector as 0.0 to avoid division by zero.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    # Shift score = 1 - (max cosine similarity to any topic on the other
    # side); values near 1 flag topics with no close counterpart.
    before_shift = []
    for i in range(D.shape[0]):
        sims = [cosine_similarity(D[i], S[j]) for j in range(S.shape[0])]
        before_shift.append((i, 1.0 - max(sims) if sims else 1.0))

    after_shift = []
    for j in range(S.shape[0]):
        sims = [cosine_similarity(S[j], D[i]) for i in range(D.shape[0])]
        after_shift.append((j, 1.0 - max(sims) if sims else 1.0))

    before_shift.sort(key=lambda x: x[1], reverse=True)
    after_shift.sort(key=lambda x: x[1], reverse=True)

    def top_words(topic_vec, topn=8):
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")

    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")


def main():
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        print(f"Error loading graph file: {e}")
        return

    # connected_components requires an undirected graph; a GraphML file may
    # declare a directed one, so convert defensively before finding the LCC.
    if G.is_directed():
        G = G.to_undirected()

    LCC_nodes = max(nx.connected_components(G), key=len)
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )

    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)


if __name__ == "__main__":
    main()