# sna/app.py
# (Stray Hugging Face Spaces file-viewer header — "insikt's picture /
#  update app / 2c4d5ab" — converted to a comment so the file parses.)
import tempfile
import streamlit as st
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import io
import base64
import json
from matplotlib import pylab # type: ignore
from PIL import Image
# from SNA import constructEdgeListFromDict, save_graph, getNodesCommunity, getGraphCommunities, getNodesPageRank, getNodesCentralityBetwenness,getNodesIndegreeOutdegree, socialNetworkAnalysisMetrics
# Add your existing functions here
def get_table_download_link(df, file_name, file_description):
    """Build an HTML anchor that downloads *df* as a CSV file.

    The frame is serialized without its index and embedded directly in
    the link as a base64 data URI, so no server-side file is needed.
    """
    encoded = base64.b64encode(df.to_csv(index=False).encode()).decode()
    return (
        f'<a href="data:file/csv;base64,{encoded}" '
        f'download="{file_name}">{file_description}</a>'
    )
# Page heading for the Streamlit app.
st.title("Social Network Analysis")
#-------------------------------------------
# SNA helper functions (originally ported from a Colab notebook).
def constructEdgeListFromDict(messagesDF):
    """Turn a messages frame into a weighted edge list.

    Keeps only the 'user' and 'replytoauthor' columns, drops rows with
    no reply target, and counts how often each (user, replytoauthor)
    pair occurs; that count becomes the edge 'weight'.
    """
    replies = messagesDF[['user', 'replytoauthor']].dropna(subset=['replytoauthor'])
    weighted = (
        replies.groupby(['user', 'replytoauthor'])
        .size()
        .reset_index(name='weight')
    )
    return weighted
# def save_graph(graph,file_name):
# #initialze Figure
# plt.figure(num=None, figsize=(20, 20), dpi=80)
# plt.axis('off')
# fig = plt.figure(1)
# pos = nx.spring_layout(graph) # type: ignore
# nx.draw_networkx_nodes(graph,pos)
# nx.draw_networkx_edges(graph,pos)
# nx.draw_networkx_labels(graph,pos)
# cut = 1.00
# xmax = cut * max(xx for xx, yy in pos.values())
# ymax = cut * max(yy for xx, yy in pos.values())
# plt.xlim(0, xmax)
# plt.ylim(0, ymax)
# plt.show()
# plt.savefig(file_name,bbox_inches="tight")
# pylab.close()
# del fig
# @st.cache_data
def save_graph(graph, file_name):
    """Render *graph* with a spring layout and return it as a PIL Image.

    NOTE: *file_name* is kept for backward compatibility but is unused
    (the original implementation also never wrote to it — it saved to a
    leaked NamedTemporaryFile instead).  The figure is now rendered
    entirely in memory, so no temp files accumulate on disk, and the
    Matplotlib figure is explicitly closed to avoid a figure leak.
    """
    fig = plt.figure(figsize=(20, 20), dpi=80)
    plt.axis('off')
    pos = nx.spring_layout(graph)  # type: ignore
    nx.draw_networkx_nodes(graph, pos)  # type: ignore
    nx.draw_networkx_edges(graph, pos)  # type: ignore
    nx.draw_networkx_labels(graph, pos)  # type: ignore
    # Clamp axes to the layout's extent (cut == 1.0 keeps everything).
    cut = 1.00
    xmax = cut * max(xx for xx, yy in pos.values())
    ymax = cut * max(yy for xx, yy in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)
    # Render into an in-memory PNG buffer instead of a temp file on disk.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight")
    buf.seek(0)
    img = Image.open(buf)
    img.load()  # force PIL to read the buffer before the figure is freed
    plt.close(fig)  # release the figure; pylab.close() was unreliable here
    return img
def getNodesCommunity(communities):
    """Invert a sequence of communities into a vertex -> index mapping.

    *communities* is an iterable of vertex collections; every vertex is
    mapped to the position of the community that contains it.
    """
    return {
        vertex: index
        for index, members in enumerate(communities)
        for vertex in members
    }
def getGraphCommunities(G, seed = 42):
    """Detect communities in G and return a node -> community-id dict.

    Communities are found with greedy modularity maximization on the
    undirected projection of G, sorted largest-first so that community 0
    is the biggest.

    NOTE(review): *seed* is accepted but never used — it only mattered
    for the commented-out label-propagation variant below; confirm it
    can be dropped.
    """
    # If only getting the largest component, the other components.
    # would be grouped in a community by themselves.
    # Using only for "large" graphs (100k nodes or above)
    #if(G.number_of_nodes() >= 100000):
    # largestComponent = max(nx.weakly_connected_components(G), key=len)
    # G = G.subgraph(largestComponent)
    # Algorithm: Label Propagation. Con: Detects way too many comms.
    # community_generator = community.asyn_lpa_communities(G, 'weight', seed = seed)
    # Algorithm: Modularity Maximization. Does not take into account direction of interactions.
    G_un = G.to_undirected() # Graph must be undirected.
    community_generator = nx.community.greedy_modularity_communities(G_un, 'weight')
    # Largest communities first -> community ids ordered by size.
    communities = sorted(community_generator, key=len, reverse=True)
    nodeComms = getNodesCommunity(communities)
    del G_un
    return(nodeComms)
def getNodesPageRank(G):
    """Return the weighted PageRank of every node in G (damping 0.85)."""
    return nx.pagerank(G, alpha=0.85, weight='weight')
def getNodesCentralityBetwenness(G):
    """Weighted betweenness centrality on the undirected version of G.

    NOTE(review): networkx interprets the 'weight' attribute as a
    *distance* here, so heavier (more frequent) interactions count as
    longer paths — confirm that is the intended reading.
    """
    undirected = G.to_undirected()  # betweenness is computed undirected
    return nx.betweenness_centrality(undirected, weight='weight')
def getNodesIndegreeOutdegree(G):
    """Per-node ratio of weighted in-degree to (weighted out-degree + 1).

    The +1 in the denominator guards against division by zero for nodes
    that never reply to anyone.
    """
    in_deg = G.in_degree(weight='weight')
    out_deg = G.out_degree(weight='weight')
    return {node: in_deg[node] / (out_deg[node] + 1) for node in list(G.nodes)}
# @st.cache_data
def socialNetworkAnalysisMetrics(G):
    """Run the full SNA pipeline on directed graph G.

    Computes communities (greedy modularity), PageRank, betweenness and
    the in/out-degree ratio, then merges them into one record per node.

    Returns a list of dicts with keys: 'user', 'pagerank', 'community',
    'betweenness', 'indegree_outdegree'.
    """
    print("[SNA] Starting Social Network Analysis")
    print("[SNA] Community detection: Modularity Maximization...")
    communities = getGraphCommunities(G)
    print('[SNA] Community detection: Modularity Maximization -- Done')
    print("[SNA] Centrality Measure: Page Rank...")
    pagerank = getNodesPageRank(G)
    print("[SNA] Centrality Measure: Page Rank -- Done")
    print("[SNA] Centrality Measure: Betweenness...")
    betweenness = getNodesCentralityBetwenness(G)
    print("[SNA] Centrality Measure: Betweenness -- Done")
    print("[SNA] Centrality Measure: Indegree/Outdegree...")
    inOut = getNodesIndegreeOutdegree(G)
    print("[SNA] Centrality Measure: Indegree/Outdegree -- Done")
    # Merge results.
    # BUG FIX: the metric dicts are keyed by the node objects themselves,
    # not by str(node); the old str(v) lookups raised KeyError whenever
    # node ids were not strings (e.g. numeric user ids from the CSV).
    print("[SNA] Merging results...")
    allResults = [
        {
            'user': v,
            'pagerank': pagerank[v],
            'community': communities[v],
            'betweenness': betweenness[v],
            'indegree_outdegree': inOut[v],
        }
        for v in list(G.nodes)
    ]
    print("[SNA] Merging results -- Done")
    return allResults
#-------------------------------------------
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
if uploaded_file is not None:
    # Expect a semicolon-separated CSV with (at least) 'user' and
    # 'replytoauthor' columns; normalize column names to lower case.
    df = pd.read_csv(uploaded_file, sep=';', encoding='utf8')
    df.columns = df.columns.str.lower()
    st.write(df)

    # Build the weighted reply graph from the edge list.
    edgelistDF = constructEdgeListFromDict(df)
    G = nx.from_pandas_edgelist(edgelistDF, source='user', target='replytoauthor', edge_attr='weight', create_using=nx.DiGraph)  # type: ignore
    st.write("Graph size: ", G.size())

    with st.spinner('Loading image'):
        img = save_graph(G, "my_graph.png")
    if img is not None:
        st.write("Here is the graph visualization:")
        st.image(img)

    with st.spinner('Getting SNA metrics'):
        result = socialNetworkAnalysisMetrics(G)
        # Build the frame directly from the records; the previous
        # json.dumps -> pd.read_json round trip was redundant (and
        # read_json on a literal string is deprecated in pandas).
        df_result = pd.DataFrame(result)
        st.write(df_result)

        #------------------------------
        # Scatter: PageRank vs Betweenness, colored by community.
        fig, ax = plt.subplots()
        df_result.plot.scatter(x='pagerank', y='betweenness', c='community', colormap='CMRmap', ax=ax)
        ax.set_title('Pagerank vs Betweenness')
        ax.set_xlabel('Pagerank')
        ax.set_ylabel('Betweenness')
        st.pyplot(fig)

        def _top10_bar_plot(frame, metric):
            """Bar plot of the 10 users with the highest summed *metric*."""
            sns.set(rc={'figure.figsize': (16, 9)})
            top = (
                frame.groupby('user', as_index=False)[metric]
                .sum()
                .sort_values(by=metric, ascending=False)
                .head(10)
            )
            axis = sns.barplot(data=top, x='user', y=metric, hue='user', dodge=False)
            axis.set(xticklabels=[])
            return axis.get_figure()

        # The four original bar-plot functions were copy-pasted variants
        # of the same routine; they are consolidated into _top10_bar_plot.
        st.write("Top Betweenness: ")
        st.pyplot(_top10_bar_plot(df_result, 'betweenness'))

        st.title("PageRank Bar Plot")
        st.pyplot(_top10_bar_plot(df_result, 'pagerank'))

        st.title("In-degree Out-degree Bar Plot")
        st.pyplot(_top10_bar_plot(df_result, 'indegree_outdegree'))

        # NOTE: the original app rendered the PageRank bar plot a second
        # time (pagerank_bar_plot2 was an identical copy); kept for parity.
        st.pyplot(_top10_bar_plot(df_result, 'pagerank'))