Eleonora Bernasconi committed on
Commit
df6bc1d
·
1 Parent(s): ea361ad
__pycache__/analysis.cpython-37.pyc CHANGED
Binary files a/__pycache__/analysis.cpython-37.pyc and b/__pycache__/analysis.cpython-37.pyc differ
 
__pycache__/intro.cpython-37.pyc CHANGED
Binary files a/__pycache__/intro.cpython-37.pyc and b/__pycache__/intro.cpython-37.pyc differ
 
analysis.py CHANGED
@@ -1,56 +1,249 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
 
4
  import seaborn as sns
5
-
6
- st.write("# Analysis of Knowledge")
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def load_data():
9
- data = pd.read_csv("output/scholar_dblp_semantics.csv", sep=";", encoding="utf-8")
 
10
  return data
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def main():
 
 
13
  data = load_data()
14
 
15
  # Step 1: Display the data retrieved from DBLP
16
  step_1 = st.sidebar.checkbox("1 - Display the crawled data")
 
 
 
 
 
 
 
17
  if step_1:
18
  st.write("Crawled data:")
19
  st.write(data)
20
 
21
- # Choose whether to calculate citations from 'cites_scholar', 'cites_semantic', or 'max_cit'
22
- citation_source = st.selectbox("Select Citation Source", ['cites_scholar', 'cites_semantic', 'max_cit'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Calculate citations based on the selected source
25
- if citation_source == 'max_cit':
26
- data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)
27
- citation_source = 'max_cit'
28
 
29
- # Group data by author and sum citations for each author
30
- authors_citation_counts = data.groupby('Authors')[citation_source].sum().reset_index()
 
31
 
32
- # Sort authors by descending citations
33
- top_authors = authors_citation_counts.sort_values(by=citation_source, ascending=False)
34
 
35
- # Separate authors and calculate citations for each author
36
- author_citation_counts = pd.DataFrame(columns=['Authors', citation_source])
 
 
 
37
 
38
- for _, row in top_authors.iterrows():
39
- authors = row['Authors'].split(', ')
40
- citations = row[citation_source]
41
 
42
- for author in authors:
43
- author_citation_counts = author_citation_counts.append({'Author': author, 'Total Citations': citations}, ignore_index=True)
44
 
45
- # Group by author and sum the citations
46
- author_citation_totals = author_citation_counts.groupby('Author')['Total Citations'].sum().reset_index()
 
 
 
 
 
 
47
 
48
- # Display the results
49
- st.write("Total Citations for Each Author:")
50
- st.dataframe(author_citation_totals)
51
 
52
- # Find the years in which papers received the most citations
53
- st.write("\nYears in which papers received the most citations:")
54
- years_with_most_citations = data.groupby('Year')[citation_source].sum().reset_index()
55
- years_with_most_citations = years_with_most_citations.sort_values(by=citation_source, ascending=False).head(5)
56
- st.dataframe(years_with_most_citations)
 
1
  import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
+ from wordcloud import WordCloud
5
  import seaborn as sns
6
+ import gensim
7
+ from gensim import corpora
8
+ from gensim.models.ldamodel import LdaModel
9
+ import pyLDAvis.gensim_models as gensimvis
10
+ import string
11
+ import pyLDAvis
12
+ import nltk
13
+ from nltk.corpus import stopwords
14
+ from nltk.tokenize import word_tokenize
15
+ from nltk.stem import WordNetLemmatizer
16
+ import plotly.express as px
17
+ import plotly.graph_objects as go
18
+ import matplotlib as mpl
19
+ import numpy as np
20
 
21
  def load_data():
22
+ data = pd.read_csv("output/scholar_dblp_semantics.csv",
23
+ sep=";", encoding="utf-8")
24
  return data
25
 
26
+
27
+ def preprocess_text(text):
28
+ # Convert text to lowercase
29
+ text = text.lower()
30
+
31
+ # Tokenize the text into words
32
+ words = word_tokenize(text)
33
+
34
+ # Remove stopwords and punctuation
35
+ stop_words = set(stopwords.words('english'))
36
+ words = [
37
+ word for word in words if word not in stop_words and word not in string.punctuation]
38
+
39
+ # Lemmatize words
40
+ lemmatizer = WordNetLemmatizer()
41
+ words = [lemmatizer.lemmatize(word) for word in words]
42
+
43
+ # Remove "'s" and "'nt" from words
44
+ words = [word.replace("'s", "").replace("n't", "").replace(
45
+ "'d", "").replace("'m", "").replace('"', "") for word in words]
46
+
47
+ return words
48
+
49
+
50
+ def train_lda_model(corpus, dictionary, num_topics, passes=15):
51
+ lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes, alpha='auto')
52
+ return lda_model
53
+
54
  def main():
55
+ st.write("# Analysis of Knowledge")
56
+
57
  data = load_data()
58
 
59
  # Step 1: Display the data retrieved from DBLP
60
  step_1 = st.sidebar.checkbox("1 - Display the crawled data")
61
+ # Step 2: Select Citation Source
62
+ step_2 = st.sidebar.checkbox("2 - Select Citation Source")
63
+ # Step 3: Elaborate Analysis
64
+ step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
65
+ # Step 4: Analyze Topics
66
+ step_4 = st.sidebar.checkbox("4 - Analyze Topics")
67
+
68
  if step_1:
69
  st.write("Crawled data:")
70
  st.write(data)
71
 
72
+ if step_2:
73
+ # Choose whether to calculate citations from 'cites_scholar', 'cites_semantic', or 'max_cit'
74
+ citation_source = st.selectbox("Select Citation Source", ['cites_scholar', 'cites_semantic', 'max_cit'])
75
+
76
+ # Calculate citations based on the selected source
77
+ if citation_source == 'max_cit':
78
+ data['max_cit'] = data[['cites_scholar','cites_semantic']].max(axis=1)
79
+ citation_source = 'max_cit'
80
+
81
+ if step_3:
82
+ # Group data by author and sum citations for each author
83
+ authors_citation_counts = data.groupby(
84
+ 'Authors')[citation_source].sum().reset_index()
85
+
86
+ # Sort authors by descending citations
87
+ top_authors = authors_citation_counts.sort_values(
88
+ by=citation_source, ascending=False)
89
+
90
+ # Separate authors and calculate citations for each author
91
+ author_citation_counts = pd.DataFrame(columns=['Authors', citation_source])
92
+
93
+ for _, row in top_authors.iterrows():
94
+ authors = row['Authors'].split(', ')
95
+ citations = row[citation_source]
96
+
97
+ for author in authors:
98
+ author_citation_counts = author_citation_counts.append(
99
+ {'Author': author, 'Total Citations': citations}, ignore_index=True)
100
+
101
+ # Group by author and sum the citations
102
+ author_citation_totals = author_citation_counts.groupby(
103
+ 'Author')['Total Citations'].sum().reset_index()
104
+
105
+ # Display the results
106
+ st.write("Total Citations for Each Author:")
107
+ st.dataframe(author_citation_totals)
108
+
109
+ # Find the years in which papers received the most citations
110
+ st.write("\nYears in which papers received the most citations:")
111
+ years_with_most_citations = data.groupby(
112
+ 'Year')[citation_source].sum().reset_index()
113
+ years_with_most_citations = years_with_most_citations.sort_values(
114
+ by=citation_source, ascending=False).head(5)
115
+ st.write(years_with_most_citations)
116
+
117
+ if step_4:
118
+ # Add a sidebar to your Streamlit app
119
+ st.sidebar.title("Topic Modeling Settings")
120
+
121
+ # Create a number input for selecting the number of topics
122
+ num_topics = st.sidebar.number_input(
123
+ "Number of Topics", min_value=1, value=5, step=1)
124
+
125
+ # Display the selected number of topics
126
+ st.write(f"You have selected {num_topics} topics for topic modeling.")
127
+
128
+ # Create a list of titles for topic analysis
129
+ titles = data['Title'].tolist()
130
+
131
+ # Preprocess titles (tokenization, stop words removal, etc.)
132
+ preprocessed_titles = [preprocess_text(title) for title in titles]
133
+
134
+ # Create a dictionary and a corpus
135
+ dictionary = corpora.Dictionary(preprocessed_titles)
136
+ corpus = [dictionary.doc2bow(title_tokens) for title_tokens in preprocessed_titles]
137
+
138
+ # Build an LDA model
139
+ lda_model = LdaModel(corpus=corpus, id2word=dictionary,num_topics=num_topics, passes=15, alpha='auto')
140
+
141
+ # Visualizza i topic con PyLDAvis
142
+ vis_data = pyLDAvis.gensim_models.prepare(
143
+ lda_model, corpus, dictionary)
144
+ html_string = pyLDAvis.prepared_data_to_html(vis_data)
145
+
146
+ st.subheader("LDAVis Topic Visualization")
147
+ st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
148
+ st.components.v1.html(html_string, height=1100, width=1400)
149
+
150
+ # Group data by year and train an LDA model for each year
151
+ years = sorted(data['Year'].unique()) # Sort the years in ascending order
152
+ lda_models = {}
153
+
154
+ for year in years:
155
+ year_data = data[data['Year'] == year]
156
+ year_titles = year_data['Title'].tolist()
157
+ preprocessed_year_titles = [
158
+ preprocess_text(title) for title in year_titles]
159
+ year_corpus = [dictionary.doc2bow(
160
+ title_tokens) for title_tokens in preprocessed_year_titles]
161
+
162
+ # Train an LDA model for the current year
163
+ lda_model = train_lda_model(
164
+ year_corpus, dictionary, num_topics=num_topics, passes=15)
165
+ lda_models[year] = lda_model
166
+
167
+ # Crea un DataFrame con i dati aggregati dei topic per ogni anno
168
+ topic_data = {}
169
+ for year, lda_model in lda_models.items():
170
+ year_topics = lda_model.show_topics(num_words=5)
171
+ topics = [f"Topic {topic[0]}" for topic in year_topics]
172
+ topic_weights = [sum(float(w.split('*')[0])
173
+ for w in topic[1].split(" + ")) for topic in year_topics]
174
+ topic_data[year] = dict(zip(topics, topic_weights))
175
+
176
+ df = pd.DataFrame(topic_data).T
177
+
178
+ # Create a list of years for x-axis
179
+ years_list = [str(year) for year in df.index]
180
+ colorpalette = 'gnuplot'
181
+ # Generate a color palette using Seaborn
182
+ rgb_colors = sns.color_palette(colorpalette, num_topics)
183
+
184
+ # Convert RGB colors to RGB hex codes
185
+ topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))
186
+
187
+ # Crea un DataFrame con i dati aggregati dei topic per ogni anno
188
+ topic_data = {}
189
+ for year, lda_model in lda_models.items():
190
+ year_topics = lda_model.show_topics(num_words=5)
191
+ topics = [f"Topic {topic[0]}" for topic in year_topics]
192
+ topic_weights = [sum(float(w.split('*')[0])
193
+ for w in topic[1].split(" + ")) for topic in year_topics]
194
+ topic_data[year] = dict(zip(topics, topic_weights))
195
+
196
+ df = pd.DataFrame(topic_data).T
197
+
198
+ # Calcola la tendenza dei pesi dei topic nel tempo
199
+ trend_data = {}
200
+ for topic in topics:
201
+ topic_values = df[topic].values
202
+ trend = "Increasing" if topic_values[-1] > topic_values[0] else "Decreasing"
203
+ trend_data[topic] = trend
204
+
205
+ # Crea un DataFrame con le tendenze dei topic
206
+ # trend_df = pd.DataFrame(trend_data, index=['Trend']).T
207
+
208
+ # # Visualizza le tendenze dei topic
209
+ # st.write("Trends of Topics Over Years:")
210
+ # st.write(trend_df)
211
+
212
+ # Create a stacked bar chart
213
+ st.write("# Evolution of Topics Over Years")
214
 
215
+ # Create an interactive Plotly line chart for all topics with consistent colors
216
+ fig = go.Figure()
 
 
217
 
218
+ for i, topic in enumerate(topics):
219
+ fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
220
+ name=f"Topic {i+1}", line=dict(color=topic_colors[i])))
221
 
222
+ # Calcola la linea di tendenza (ad esempio, regressione lineare) per il topic
223
+ years_list_numeric = [int(year) for year in years_list]
224
 
225
+ z = np.polyfit(years_list_numeric, df[topic], 1)
226
+ p = np.poly1d(z)
227
+ fig.add_trace(go.Scatter(x=years_list_numeric, y=p(years_list_numeric),
228
+ mode='lines', name=f"Trend {i+1}: {trend_data[topic]}",
229
+ line=dict(color=topic_colors[i], dash='dash')))
230
 
231
+ # Imposta il layout del grafico
232
+ fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
233
+ title='Evolution of Topics Over Years', xaxis=dict(type='category'))
234
 
235
+ # Visualizza il grafico
236
+ st.plotly_chart(fig)
237
 
238
+ # Visualizza le Word Cloud per ciascun anno
239
+ for i, (year, lda_model) in enumerate(lda_models.items()):
240
+ # Crea e visualizza la Word Cloud
241
+ year_topics = lda_model.show_topics(num_words=5)
242
+ topics_words = [word for topic in year_topics for word in topic[1].split(" + ")]
243
+ wordcloud = WordCloud(width=800, height=400, background_color='white', colormap=colorpalette).generate(" ".join(topics_words))
244
+ st.subheader(f"Word Cloud for Year {year}")
245
+ st.image(wordcloud.to_image())
246
 
 
 
 
247
 
248
+ if __name__ == "__main__":
249
+ main()
 
 
 
app.py CHANGED
@@ -4,6 +4,17 @@ import knowledge_extraction
4
  import analysis
5
  import merge
6
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  page_names_to_funcs = {
9
  "Welcome": intro.intro,
@@ -12,5 +23,6 @@ page_names_to_funcs = {
12
  "Analysis": analysis.main,
13
  }
14
 
15
- demo_name = st.sidebar.selectbox("Choose a phase", page_names_to_funcs.keys())
 
16
  page_names_to_funcs[demo_name]()
 
4
  import analysis
5
  import merge
6
 
7
+ st.set_page_config(
8
+ page_title="IRCDL Conference",
9
+ page_icon="📖",
10
+ layout="wide",
11
+ initial_sidebar_state="expanded",
12
+ menu_items={
13
+ 'Get Help': 'https://ircdl2024.dei.unipd.it/',
14
+ 'Report a bug': "https://ircdl2024.dei.unipd.it/",
15
+ 'About': "# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research"
16
+ }
17
+ )
18
 
19
  page_names_to_funcs = {
20
  "Welcome": intro.intro,
 
23
  "Analysis": analysis.main,
24
  }
25
 
26
+ demo_name = st.sidebar.selectbox("IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research", page_names_to_funcs.keys())
27
+
28
  page_names_to_funcs[demo_name]()
intro.py CHANGED
@@ -1,7 +1,7 @@
1
- import streamlit
2
 
3
  def intro():
4
- import streamlit as st
5
 
6
  st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
7
  st.sidebar.success("Select a phase")
@@ -11,3 +11,5 @@ def intro():
11
  IRCDL site: https://ircdl2024.dei.unipd.it/
12
  """
13
  )
 
 
 
1
+ import streamlit as st
2
 
3
  def intro():
4
+ import streamlit as st
5
 
6
  st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
7
  st.sidebar.success("Select a phase")
 
11
  IRCDL site: https://ircdl2024.dei.unipd.it/
12
  """
13
  )
14
+
15
+