Spaces:
Runtime error
Runtime error
Eleonora Bernasconi committed on
Commit ·
df6bc1d
1
Parent(s): ea361ad
up
Browse files- __pycache__/analysis.cpython-37.pyc +0 -0
- __pycache__/intro.cpython-37.pyc +0 -0
- analysis.py +223 -30
- app.py +13 -1
- intro.py +4 -2
__pycache__/analysis.cpython-37.pyc
CHANGED
|
Binary files a/__pycache__/analysis.cpython-37.pyc and b/__pycache__/analysis.cpython-37.pyc differ
|
|
|
__pycache__/intro.cpython-37.pyc
CHANGED
|
Binary files a/__pycache__/intro.cpython-37.pyc and b/__pycache__/intro.cpython-37.pyc differ
|
|
|
analysis.py
CHANGED
|
@@ -1,56 +1,249 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import matplotlib.pyplot as plt
|
|
|
|
| 4 |
import seaborn as sns
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def load_data():
|
| 9 |
-
data = pd.read_csv("output/scholar_dblp_semantics.csv",
|
|
|
|
| 10 |
return data
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def main():
|
|
|
|
|
|
|
| 13 |
data = load_data()
|
| 14 |
|
| 15 |
# Step 1: Display the data retrieved from DBLP
|
| 16 |
step_1 = st.sidebar.checkbox("1 - Display the crawled data")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
if step_1:
|
| 18 |
st.write("Crawled data:")
|
| 19 |
st.write(data)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)
|
| 27 |
-
citation_source = 'max_cit'
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
# Display the results
|
| 49 |
-
st.write("Total Citations for Each Author:")
|
| 50 |
-
st.dataframe(author_citation_totals)
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
years_with_most_citations = data.groupby('Year')[citation_source].sum().reset_index()
|
| 55 |
-
years_with_most_citations = years_with_most_citations.sort_values(by=citation_source, ascending=False).head(5)
|
| 56 |
-
st.dataframe(years_with_most_citations)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
+
from wordcloud import WordCloud
|
| 5 |
import seaborn as sns
|
| 6 |
+
import gensim
|
| 7 |
+
from gensim import corpora
|
| 8 |
+
from gensim.models.ldamodel import LdaModel
|
| 9 |
+
import pyLDAvis.gensim_models as gensimvis
|
| 10 |
+
import string
|
| 11 |
+
import pyLDAvis
|
| 12 |
+
import nltk
|
| 13 |
+
from nltk.corpus import stopwords
|
| 14 |
+
from nltk.tokenize import word_tokenize
|
| 15 |
+
from nltk.stem import WordNetLemmatizer
|
| 16 |
+
import plotly.express as px
|
| 17 |
+
import plotly.graph_objects as go
|
| 18 |
+
import matplotlib as mpl
|
| 19 |
+
import numpy as np
|
| 20 |
|
| 21 |
def load_data():
    """Load the merged Scholar/DBLP/Semantic Scholar dataset.

    Reads the semicolon-separated, UTF-8 encoded CSV produced by the
    crawling/merge steps and returns it as a DataFrame.
    """
    csv_path = "output/scholar_dblp_semantics.csv"
    return pd.read_csv(csv_path, sep=";", encoding="utf-8")
|
| 25 |
|
| 26 |
+
|
| 27 |
+
def preprocess_text(text):
    """Normalize *text* into a list of cleaned word tokens.

    Pipeline: lowercase -> word-tokenize -> drop English stopwords and
    punctuation -> lemmatize -> strip leftover contraction suffixes and
    stray double quotes.
    """
    tokens = word_tokenize(text.lower())

    # Drop English stopwords and bare punctuation tokens.
    stop_words = set(stopwords.words('english'))
    tokens = [tok for tok in tokens
              if tok not in stop_words and tok not in string.punctuation]

    # Reduce every surviving token to its lemma.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    # Remove contraction remnants ("'s", "n't", "'d", "'m") and quotes.
    cleaned = []
    for tok in tokens:
        for junk in ("'s", "n't", "'d", "'m", '"'):
            tok = tok.replace(junk, "")
        cleaned.append(tok)
    return cleaned
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Fit and return a gensim LDA model on *corpus*.

    Uses the shared *dictionary* as the id->word mapping and lets gensim
    learn the document-topic prior (``alpha='auto'``).
    """
    return LdaModel(corpus=corpus, id2word=dictionary,
                    num_topics=num_topics, passes=passes, alpha='auto')
|
| 53 |
+
|
| 54 |
def main():
    """Streamlit page: citation analysis and LDA topic modeling.

    Four sidebar-toggled steps: show the crawled data, pick a citation
    source, per-author/per-year citation aggregates, and LDA topic
    evolution over publication years.
    """
    st.write("# Analysis of Knowledge")

    data = load_data()

    # Step 1: Display the data retrieved from DBLP
    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    # Step 2: Select Citation Source
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    # Step 3: Elaborate Analysis
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    # Step 4: Analyze Topics
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")

    # Default citation column: step 3 reads citation_source, which was
    # previously undefined (NameError) when step 2 was left unchecked.
    citation_source = 'cites_scholar'

    if step_1:
        st.write("Crawled data:")
        st.write(data)

    if step_2:
        # Choose whether to count citations from 'cites_scholar',
        # 'cites_semantic', or the row-wise maximum of the two.
        citation_source = st.selectbox(
            "Select Citation Source",
            ['cites_scholar', 'cites_semantic', 'max_cit'])

        # Materialise the derived column when the max is requested.
        if citation_source == 'max_cit':
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)

    if step_3:
        # Guard: 'max_cit' may be selected without the column existing yet.
        if citation_source == 'max_cit' and 'max_cit' not in data.columns:
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)

        # Group by the raw (comma-joined) author string and sum citations.
        authors_citation_counts = data.groupby(
            'Authors')[citation_source].sum().reset_index()

        # Sort author groups by descending citations.
        top_authors = authors_citation_counts.sort_values(
            by=citation_source, ascending=False)

        # Split joined author strings, crediting each author with the
        # paper's citations.  Rows are collected in a plain list because
        # DataFrame.append() was removed in pandas 2.0 (and was O(n^2));
        # this also fixes the old column mismatch ('Authors'/<source> vs
        # the 'Author'/'Total Citations' keys actually appended).
        rows = []
        for _, row in top_authors.iterrows():
            citations = row[citation_source]
            for author in row['Authors'].split(', '):
                rows.append({'Author': author, 'Total Citations': citations})
        author_citation_counts = pd.DataFrame(
            rows, columns=['Author', 'Total Citations'])

        # Group by individual author and sum the citations.
        author_citation_totals = author_citation_counts.groupby(
            'Author')['Total Citations'].sum().reset_index()

        # Display the results
        st.write("Total Citations for Each Author:")
        st.dataframe(author_citation_totals)

        # Find the years in which papers received the most citations.
        st.write("\nYears in which papers received the most citations:")
        years_with_most_citations = data.groupby(
            'Year')[citation_source].sum().reset_index()
        years_with_most_citations = years_with_most_citations.sort_values(
            by=citation_source, ascending=False).head(5)
        st.write(years_with_most_citations)

    if step_4:
        st.sidebar.title("Topic Modeling Settings")

        # Number of LDA topics, chosen in the sidebar.
        num_topics = st.sidebar.number_input(
            "Number of Topics", min_value=1, value=5, step=1)
        st.write(f"You have selected {num_topics} topics for topic modeling.")

        # Preprocess titles and build the shared dictionary/corpus.
        titles = data['Title'].tolist()
        preprocessed_titles = [preprocess_text(title) for title in titles]
        dictionary = corpora.Dictionary(preprocessed_titles)
        corpus = [dictionary.doc2bow(title_tokens)
                  for title_tokens in preprocessed_titles]

        # LDA over the whole corpus, visualised with pyLDAvis.
        lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                             num_topics=num_topics, passes=15, alpha='auto')
        vis_data = pyLDAvis.gensim_models.prepare(
            lda_model, corpus, dictionary)
        html_string = pyLDAvis.prepared_data_to_html(vis_data)

        st.subheader("LDAVis Topic Visualization")
        st.markdown('<div style="padding: 20px 5px;"></div>',
                    unsafe_allow_html=True)
        st.components.v1.html(html_string, height=1100, width=1400)

        # One LDA model per publication year (shared dictionary).
        years = sorted(data['Year'].unique())
        lda_models = {}
        for year in years:
            year_titles = data[data['Year'] == year]['Title'].tolist()
            preprocessed_year_titles = [
                preprocess_text(title) for title in year_titles]
            year_corpus = [dictionary.doc2bow(title_tokens)
                           for title_tokens in preprocessed_year_titles]
            lda_models[year] = train_lda_model(
                year_corpus, dictionary, num_topics=num_topics, passes=15)

        # Aggregate per-year topic weights into a DataFrame (year x topic).
        # NOTE: the original computed this identical block twice; once is
        # enough — show_topics() is deterministic for a fitted model.
        topic_data = {}
        for year, year_model in lda_models.items():
            year_topics = year_model.show_topics(num_words=5)
            topics = [f"Topic {topic[0]}" for topic in year_topics]
            topic_weights = [sum(float(w.split('*')[0])
                                 for w in topic[1].split(" + "))
                             for topic in year_topics]
            topic_data[year] = dict(zip(topics, topic_weights))

        df = pd.DataFrame(topic_data).T

        # Consistent per-topic colours across all charts.
        years_list = [str(year) for year in df.index]
        colorpalette = 'gnuplot'
        rgb_colors = sns.color_palette(colorpalette, num_topics)
        topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))

        # Trend direction of each topic weight over time (last vs first).
        trend_data = {}
        for topic in topics:
            topic_values = df[topic].values
            trend_data[topic] = ("Increasing"
                                 if topic_values[-1] > topic_values[0]
                                 else "Decreasing")

        st.write("# Evolution of Topics Over Years")

        # Interactive Plotly line chart: one weight line and one dashed
        # linear-regression trend line per topic.
        fig = go.Figure()
        years_list_numeric = [int(year) for year in years_list]
        for i, topic in enumerate(topics):
            fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
                                     name=f"Topic {i+1}",
                                     line=dict(color=topic_colors[i])))
            z = np.polyfit(years_list_numeric, df[topic], 1)
            p = np.poly1d(z)
            fig.add_trace(go.Scatter(x=years_list_numeric,
                                     y=p(years_list_numeric),
                                     mode='lines',
                                     name=f"Trend {i+1}: {trend_data[topic]}",
                                     line=dict(color=topic_colors[i],
                                               dash='dash')))

        fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
                          title='Evolution of Topics Over Years',
                          xaxis=dict(type='category'))
        st.plotly_chart(fig)

        # Word cloud of the top topic words for each year.
        for year, year_model in lda_models.items():
            year_topics = year_model.show_topics(num_words=5)
            topics_words = [word for topic in year_topics
                            for word in topic[1].split(" + ")]
            wordcloud = WordCloud(width=800, height=400,
                                  background_color='white',
                                  colormap=colorpalette).generate(
                                      " ".join(topics_words))
            st.subheader(f"Word Cloud for Year {year}")
            st.image(wordcloud.to_image())


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -4,6 +4,17 @@ import knowledge_extraction
|
|
| 4 |
import analysis
|
| 5 |
import merge
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
page_names_to_funcs = {
|
| 9 |
"Welcome": intro.intro,
|
|
@@ -12,5 +23,6 @@ page_names_to_funcs = {
|
|
| 12 |
"Analysis": analysis.main,
|
| 13 |
}
|
| 14 |
|
| 15 |
-
demo_name = st.sidebar.selectbox("
|
|
|
|
| 16 |
page_names_to_funcs[demo_name]()
|
|
|
|
| 4 |
import analysis
|
| 5 |
import merge
|
| 6 |
|
| 7 |
+
st.set_page_config(
|
| 8 |
+
page_title="IRCDL Conference",
|
| 9 |
+
page_icon="📖",
|
| 10 |
+
layout="wide",
|
| 11 |
+
initial_sidebar_state="expanded",
|
| 12 |
+
menu_items={
|
| 13 |
+
'Get Help': 'https://ircdl2024.dei.unipd.it/',
|
| 14 |
+
'Report a bug': "https://ircdl2024.dei.unipd.it/",
|
| 15 |
+
'About': "# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research"
|
| 16 |
+
}
|
| 17 |
+
)
|
| 18 |
|
| 19 |
page_names_to_funcs = {
|
| 20 |
"Welcome": intro.intro,
|
|
|
|
| 23 |
"Analysis": analysis.main,
|
| 24 |
}
|
| 25 |
|
| 26 |
+
demo_name = st.sidebar.selectbox("IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research", page_names_to_funcs.keys())
|
| 27 |
+
|
| 28 |
page_names_to_funcs[demo_name]()
|
intro.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
import streamlit
|
| 2 |
|
| 3 |
def intro():
|
| 4 |
-
import streamlit as st
|
| 5 |
|
| 6 |
st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
|
| 7 |
st.sidebar.success("Select a phase")
|
|
@@ -11,3 +11,5 @@ def intro():
|
|
| 11 |
IRCDL site: https://ircdl2024.dei.unipd.it/
|
| 12 |
"""
|
| 13 |
)
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
|
| 3 |
def intro():
|
| 4 |
+
import streamlit as st
|
| 5 |
|
| 6 |
st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
|
| 7 |
st.sidebar.success("Select a phase")
|
|
|
|
| 11 |
IRCDL site: https://ircdl2024.dei.unipd.it/
|
| 12 |
"""
|
| 13 |
)
|
| 14 |
+
|
| 15 |
+
|