import pandas as pd import networkx as nx import matplotlib.pyplot as plt import numpy as np import igraph as ig import plotly.subplots as sp data = pd.read_excel("all_comments.xlsx") def analyze_comments(data): # Reset the graph G = nx.DiGraph() # Add nodes to the graph representing authors for author in data['author'].unique(): G.add_node(author) # Add edges to the graph representing replies for _, row in data.dropna(subset=['linkage']).iterrows(): # Find the author of the main comment (the comment being replied to) main_comment_authors = data[data['comment_id'] == row['linkage']]['author'].values if main_comment_authors: main_comment_author = main_comment_authors[0] G.add_edge(row['author'], main_comment_author) # Calculate centrality measures again degree_centrality = nx.degree_centrality(G) in_degree_centrality = nx.in_degree_centrality(G) out_degree_centrality = nx.out_degree_centrality(G) betweenness_centrality = nx.betweenness_centrality(G) closeness_centrality = nx.closeness_centrality(G) # Create a DataFrame to display the results centrality_df = pd.DataFrame({ 'Author': list(degree_centrality.keys()), 'Degree Centrality': list(degree_centrality.values()), 'In-Degree Centrality': list(in_degree_centrality.values()), 'Out-Degree Centrality': list(out_degree_centrality.values()), 'Betweenness Centrality': list(betweenness_centrality.values()), 'Closeness Centrality': list(closeness_centrality.values()) }).sort_values(by='Degree Centrality', ascending=False) print(centrality_df.head(10)) centrality_df.head(10).to_excel("centrality.xlsx", index=False) # Select the top N authors based on degree centrality for the subgraph N = 50 top_authors = [author for author, _ in sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)[:N]] # Extract the subgraph subgraph = G.subgraph(top_authors) # Draw the subgraph fig_subgraph = plt.figure(figsize=(12, 12)) pos = nx.spring_layout(subgraph) nx.draw_networkx(subgraph, pos, with_labels=True, node_size=500, node_color='skyblue', font_size=10, alpha=0.6, edge_color='gray') plt.title("Subgraph of Top 50 Authors based on Degree Centrality") plt.close(fig_subgraph) # Sample a subset of nodes for the subgraph sample_size = 500 sampled_nodes = list(G.nodes())[:sample_size] # Extract the subgraph for the sampled nodes sampled_subgraph = G.subgraph(sampled_nodes) # Use the Girvan-Newman algorithm on the sampled subgraph sampled_communities_gn = nx.community.girvan_newman(sampled_subgraph) # Get the first partitioning of communities for the sampled subgraph sampled_first_partition = next(sampled_communities_gn) # Convert the first_partition into a more readable format sampled_community_list_gn = [list(community) for community in sampled_first_partition] # Display the number of detected communities and the size of each community for the sampled subgraph sampled_community_sizes_gn = {f"Sampled Community GN {i + 1}": len(community) for i, community in enumerate(sampled_community_list_gn)} no_of_communities = len(sampled_community_sizes_gn) # Generate a new position layout for the nodes in the sampled subgraph sampled_pos = nx.spring_layout(sampled_subgraph) # Helper function to get edges for a community def get_edges(G, community): return [(u, v) for u, v in G.edges() if u in community and v in community] # Visualize the communities in the sampled subgraph fig_communities = plt.figure(figsize=(15, 15)) # Get unique colors for each community colors = plt.cm.rainbow(np.linspace(0, 1, len(sampled_community_list_gn))) # Draw nodes and edges with community colors for community, color in zip(sampled_community_list_gn, colors): nx.draw_networkx_nodes(sampled_subgraph, sampled_pos, nodelist=community, node_color=[color] * len(community), node_size=500) nx.draw_networkx_edges(sampled_subgraph, sampled_pos, edgelist=get_edges(sampled_subgraph, community), alpha=0.5) # Draw labels for nodes nx.draw_networkx_labels(sampled_subgraph, sampled_pos, font_size=10, font_weight="bold") plt.title("Communities in Sampled Subgraph") plt.axis("off") plt.close(fig_communities) return centrality_df, fig_subgraph, fig_communities, no_of_communities # analyze_comments(data)