|
|
import pandas as pd
|
|
|
import networkx as nx
|
|
|
import matplotlib.pyplot as plt
|
|
|
import numpy as np
|
|
|
import igraph as ig
|
|
|
import plotly.subplots as sp
|
|
|
|
|
|
# Load the comment export from the working directory at import time.
# analyze_comments() reads the 'author', 'comment_id' and 'linkage' columns of
# the frame it is given; presumably this is that frame (the call site is not
# visible here -- confirm).
data = pd.read_excel("all_comments.xlsx")
|
|
|
|
|
|
|
|
|
def analyze_comments(data, *, top_n=50, sample_size=500):
    """Build the author reply graph and summarise it.

    Every distinct value of ``data['author']`` becomes a node.  Every row
    with a non-null ``linkage`` adds a directed edge from that row's author
    to the author of the comment whose ``comment_id`` equals the linkage
    (rows whose linkage matches no ``comment_id`` add no edge).

    Side effects: prints the ten highest degree-centrality rows and writes
    the same ten rows to ``centrality.xlsx`` in the working directory.

    Args:
        data: DataFrame with ``author``, ``comment_id`` and ``linkage``
            columns.
        top_n: Number of highest degree-centrality authors drawn in the
            first figure.
        sample_size: Number of nodes (in graph insertion order) used for
            community detection and the second figure.

    Returns:
        A 4-tuple ``(centrality_df, fig_subgraph, fig_communities,
        no_of_communities)``: the full centrality table sorted by degree
        centrality (descending), the top-authors figure, the communities
        figure, and the number of communities found in the sampled
        subgraph.  Both figures have been closed in pyplot before being
        returned, so the caller is expected to render or save the Figure
        objects directly.
    """
    G = _build_reply_graph(data)

    degree_centrality = nx.degree_centrality(G)
    centrality_df = _centrality_table(G, degree_centrality)

    top_rows = centrality_df.head(10)
    print(top_rows)
    top_rows.to_excel("centrality.xlsx", index=False)

    fig_subgraph = _draw_top_authors(G, degree_centrality, top_n)

    # Community detection runs on a prefix of the node list only.
    sampled_subgraph = G.subgraph(list(G.nodes())[:sample_size])
    communities = _first_community_split(sampled_subgraph)
    fig_communities = _draw_communities(sampled_subgraph, communities)

    return centrality_df, fig_subgraph, fig_communities, len(communities)


def _build_reply_graph(data):
    """Return the directed graph with an edge replier -> parent-comment author."""
    G = nx.DiGraph()
    G.add_nodes_from(data['author'].unique())

    # One pass to map each comment id to its author instead of re-filtering
    # the whole frame per reply.  keep='first' mirrors taking element [0] of
    # the filtered rows when a comment id appears more than once.
    first_rows = data.drop_duplicates(subset='comment_id', keep='first')
    author_by_comment = dict(zip(first_rows['comment_id'], first_rows['author']))

    replies = data.dropna(subset=['linkage'])
    for replier, parent_id in zip(replies['author'], replies['linkage']):
        # Explicit presence test: truth-testing the NumPy array of matches is
        # ambiguous for anything but exactly one element.
        if parent_id in author_by_comment:
            G.add_edge(replier, author_by_comment[parent_id])
    return G


def _centrality_table(G, degree_centrality):
    """Return one row per node with its centrality scores, sorted by degree."""
    in_degree = nx.in_degree_centrality(G)
    out_degree = nx.out_degree_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    closeness = nx.closeness_centrality(G)

    # Look every score up by node so the columns are aligned by key rather
    # than by the incidental ordering of five separate dictionaries.
    authors = list(G.nodes())
    table = pd.DataFrame({
        'Author': authors,
        'Degree Centrality': [degree_centrality[a] for a in authors],
        'In-Degree Centrality': [in_degree[a] for a in authors],
        'Out-Degree Centrality': [out_degree[a] for a in authors],
        'Betweenness Centrality': [betweenness[a] for a in authors],
        'Closeness Centrality': [closeness[a] for a in authors],
    })
    return table.sort_values(by='Degree Centrality', ascending=False)


def _draw_top_authors(G, degree_centrality, top_n):
    """Draw the subgraph induced by the top_n degree-centrality authors."""
    ranked = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)
    top_authors = [author for author, _ in ranked[:top_n]]
    subgraph = G.subgraph(top_authors)

    fig = plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(subgraph)
    nx.draw_networkx(subgraph, pos, with_labels=True, node_size=500, node_color='skyblue',
                     font_size=10, alpha=0.6, edge_color='gray')
    plt.title(f"Subgraph of Top {top_n} Authors based on Degree Centrality")
    plt.close(fig)
    return fig


def _first_community_split(graph):
    """Return the first Girvan-Newman partition of graph as a list of node lists."""
    # Girvan-Newman splits a graph by removing edges.  When there is no edge
    # between two distinct nodes there is nothing to remove and asking the
    # generator for its first partition fails, so treat every node as its own
    # community in that case.
    if not any(u != v for u, v in graph.edges()):
        return [[node] for node in graph.nodes()]
    first_partition = next(nx.community.girvan_newman(graph))
    return [list(community) for community in first_partition]


def _community_edges(graph, community):
    """Return the edges of graph whose two endpoints both lie in community."""
    members = set(community)
    return [(u, v) for u, v in graph.edges() if u in members and v in members]


def _draw_communities(graph, communities):
    """Draw graph with one colour per community and only intra-community edges."""
    pos = nx.spring_layout(graph)
    fig = plt.figure(figsize=(15, 15))

    colors = plt.cm.rainbow(np.linspace(0, 1, len(communities)))
    for community, color in zip(communities, colors):
        nx.draw_networkx_nodes(graph, pos, nodelist=community,
                               node_color=[color] * len(community), node_size=500)
        nx.draw_networkx_edges(graph, pos, edgelist=_community_edges(graph, community),
                               alpha=0.5)

    nx.draw_networkx_labels(graph, pos, font_size=10, font_weight="bold")
    plt.title("Communities in Sampled Subgraph")
    plt.axis("off")
    plt.close(fig)
    return fig
|
|
|
|
|
|
|
|
|
|