# Youtube-analytics-dashboard / analyze_comments.py
# Rahul-Sainy's picture
# Upload 5 files
# 422e54a verified
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import igraph as ig
import plotly.subplots as sp
# Load the comment spreadsheet when this module is imported/executed.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): analyze_comments() below reads the 'author', 'comment_id' and
# 'linkage' columns -- presumably this sheet provides them; confirm.
data = pd.read_excel("all_comments.xlsx")
def analyze_comments(data, top_n=50, sample_size=500):
    """Build the author reply network from comment data and analyse it.

    A directed graph is created with one node per unique author and an edge
    ``replier -> original author`` for every row whose ``linkage`` value
    matches the ``comment_id`` of another row.  Centrality measures are
    computed on that graph, the most central authors are drawn, and a first
    Girvan-Newman community split is computed and drawn for a sample of nodes.

    Side effects: prints the ten highest-degree rows and writes them to
    ``centrality.xlsx`` in the current working directory.

    Args:
        data: DataFrame providing the columns ``'author'``, ``'comment_id'``
            and ``'linkage'`` (``linkage`` is empty for rows that are not
            replies).
        top_n: Number of highest-degree authors drawn in the first figure.
        sample_size: Number of nodes (taken in graph insertion order) used
            for community detection.

    Returns:
        A tuple ``(centrality_df, fig_subgraph, fig_communities,
        no_of_communities)``: the full centrality table sorted by degree
        centrality (descending), the two matplotlib figures (already closed
        in pyplot, but still usable as objects), and the number of
        communities in the first partition.
    """
    # Directed reply graph: an edge A -> B means "A replied to a comment of B".
    G = nx.DiGraph()
    for author in data['author'].unique():
        G.add_node(author)

    # Resolve each comment id to its author once instead of rescanning the
    # frame for every reply.  Keeping only the first occurrence of an id
    # mirrors the original "take the first match" rule and avoids evaluating
    # the truth value of a NumPy array, which raises for multiple matches and
    # is rejected by recent NumPy versions for empty results.
    author_by_comment = (
        data.drop_duplicates(subset='comment_id')
        .set_index('comment_id')['author']
        .to_dict()
    )
    replies = data.dropna(subset=['linkage'])
    for reply_author, parent_id in zip(replies['author'], replies['linkage']):
        # Replies whose parent comment is not present in the data are skipped.
        if parent_id in author_by_comment:
            G.add_edge(reply_author, author_by_comment[parent_id])

    # Centrality measures; every dict is keyed by node in graph order, so the
    # value lists below line up row by row.
    degree_centrality = nx.degree_centrality(G)
    in_degree_centrality = nx.in_degree_centrality(G)
    out_degree_centrality = nx.out_degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)

    centrality_df = pd.DataFrame({
        'Author': list(degree_centrality.keys()),
        'Degree Centrality': list(degree_centrality.values()),
        'In-Degree Centrality': list(in_degree_centrality.values()),
        'Out-Degree Centrality': list(out_degree_centrality.values()),
        'Betweenness Centrality': list(betweenness_centrality.values()),
        'Closeness Centrality': list(closeness_centrality.values()),
    }).sort_values(by='Degree Centrality', ascending=False)

    top_rows = centrality_df.head(10)
    print(top_rows)
    top_rows.to_excel("centrality.xlsx", index=False)

    # Subgraph induced by the top_n authors with the highest degree centrality.
    top_authors = sorted(
        degree_centrality, key=degree_centrality.get, reverse=True
    )[:top_n]
    subgraph = G.subgraph(top_authors)

    fig_subgraph = plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(subgraph)
    nx.draw_networkx(
        subgraph, pos, with_labels=True, node_size=500, node_color='skyblue',
        font_size=10, alpha=0.6, edge_color='gray',
    )
    plt.title(f"Subgraph of Top {top_n} Authors based on Degree Centrality")
    # Detach the figure from pyplot; the Figure object itself is returned.
    plt.close(fig_subgraph)

    # Community detection runs on a sample (the first sample_size nodes in
    # insertion order) rather than on the whole graph.
    sampled_nodes = list(G.nodes())[:sample_size]
    sampled_subgraph = G.subgraph(sampled_nodes)

    # Girvan-Newman cannot produce a split when there is nothing to remove:
    # with no edges it fails on a directed graph, and with only self-loops
    # the generator is empty.  In both cases every node is its own community.
    first_partition = None
    if sampled_subgraph.number_of_edges() > 0:
        first_partition = next(
            nx.community.girvan_newman(sampled_subgraph), None
        )
    if first_partition is None:
        first_partition = tuple({node} for node in sampled_subgraph.nodes())

    communities = [list(members) for members in first_partition]
    no_of_communities = len(communities)

    sampled_pos = nx.spring_layout(sampled_subgraph)

    fig_communities = plt.figure(figsize=(15, 15))
    # One distinct colour per community.
    colors = plt.cm.rainbow(np.linspace(0, 1, no_of_communities))
    for members, color in zip(communities, colors):
        member_set = set(members)
        nx.draw_networkx_nodes(
            sampled_subgraph, sampled_pos, nodelist=members,
            node_color=[color] * len(members), node_size=500,
        )
        # Only edges with both endpoints inside the community are drawn.
        intra_edges = [
            (u, v) for u, v in sampled_subgraph.edges()
            if u in member_set and v in member_set
        ]
        nx.draw_networkx_edges(
            sampled_subgraph, sampled_pos, edgelist=intra_edges, alpha=0.5,
        )
    nx.draw_networkx_labels(
        sampled_subgraph, sampled_pos, font_size=10, font_weight="bold",
    )
    plt.title("Communities in Sampled Subgraph")
    plt.axis("off")
    plt.close(fig_communities)

    return centrality_df, fig_subgraph, fig_communities, no_of_communities
# analyze_comments(data)