File size: 10,294 Bytes
ea361ad
 
 
df6bc1d
ea361ad
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
ea361ad
dfb6640
ea361ad
df6bc1d
 
ea361ad
 
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce81144
 
 
 
 
 
 
 
 
 
 
ea361ad
bd2c1ff
def4987
 
 
dfb6640
df6bc1d
 
ea361ad
 
 
 
df6bc1d
 
 
 
 
 
 
ea361ad
 
ce81144
 
 
 
 
 
ea361ad
 
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb43809
 
df6bc1d
 
 
 
 
bb43809
 
 
 
 
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9a8c95
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9a8c95
df6bc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea361ad
df6bc1d
 
ea361ad
df6bc1d
 
 
ea361ad
df6bc1d
 
ea361ad
df6bc1d
 
 
 
 
ea361ad
df6bc1d
 
 
ea361ad
df6bc1d
 
ea361ad
df6bc1d
 
 
 
 
 
 
 
ea361ad
 
df6bc1d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import string
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.graph_objects as go
import matplotlib as mpl
import numpy as np 


def load_data():
    """Read the merged Scholar/DBLP/Semantic-Scholar export.

    Returns:
        pd.DataFrame: Contents of ``output/scholar_dblp_semantics.csv``
        (semicolon-separated, UTF-8 encoded).
    """
    return pd.read_csv(
        "output/scholar_dblp_semantics.csv",
        sep=";",
        encoding="utf-8",
    )


def preprocess_text(text):
    """Normalize a title into a list of cleaned, lemmatized tokens.

    Steps: lowercase, NLTK word tokenization, stopword and punctuation
    removal, WordNet lemmatization, then stripping of contraction
    fragments the tokenizer introduces.

    Args:
        text: Raw title string.

    Returns:
        list[str]: Cleaned, non-empty tokens (may be an empty list).
    """
    # Convert text to lowercase
    text = text.lower()

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords and tokens made up entirely of punctuation.
    # The previous test `word not in string.punctuation` was a substring
    # check and missed multi-character punctuation tokens such as ``
    # and '' that word_tokenize emits for quotes.
    stop_words = set(stopwords.words('english'))
    words = [
        word for word in words
        if word not in stop_words
        and not all(ch in string.punctuation for ch in word)
    ]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Strip contraction fragments ("'s", "n't", "'d", "'m") and stray quotes.
    words = [word.replace("'s", "").replace("n't", "").replace(
        "'d", "").replace("'m", "").replace('"', "") for word in words]

    # Stripping can reduce a token (e.g. "n't") to the empty string;
    # drop those so they never reach the gensim dictionary.
    return [word for word in words if word]


def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Fit a gensim LDA topic model with auto-tuned alpha.

    Args:
        corpus: Bag-of-words corpus (list of ``doc2bow`` vectors).
        dictionary: gensim ``Dictionary`` mapping token ids to tokens.
        num_topics: Number of latent topics to extract.
        passes: Training passes over the corpus (default 15).

    Returns:
        LdaModel: The trained model.
    """
    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        alpha='auto',
    )
    return model


def doi_def(row):
    """Pick the first usable DOI for a record.

    Preference order: ``DOI``, then ``doi_scholar``, then
    ``doi_semantic``. A field holding the literal string 'None'
    counts as missing.

    Args:
        row: Mapping (e.g. DataFrame row) with keys 'DOI',
            'doi_scholar' and 'doi_semantic'.

    Returns:
        str: The first value that is not 'None', or 'None' when all
        three are missing.
    """
    for column in ('DOI', 'doi_scholar', 'doi_semantic'):
        value = row[column]
        if value != 'None':
            return value
    return 'None'
        
def main():
    """Streamlit app entry point.

    Renders up to four sidebar-gated analysis steps over the crawled
    publication data: (1) raw-data display with DOI consolidation,
    (2) citation-source selection, (3) per-author / per-year citation
    aggregation, (4) LDA topic modeling with pyLDAvis, per-year trend
    lines, and word clouds.
    """
    # One-time downloads are expected to already live in ./nltk_data;
    # uncomment to fetch them on first run.
    # nltk.download('punkt')
    # nltk.download('stopwords')
    nltk.data.path.append("nltk_data")
    st.write("# Analysis of Knowledge")

    data = load_data()

    # Each analysis step is toggled independently from the sidebar.
    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")

    # Set in step 2 and required by step 3. Pre-initialized so that
    # enabling step 3 without step 2 shows a warning instead of
    # raising a NameError.
    citation_source = None

    if step_1:
        st.write("Crawled data:")
        # Consolidate the three DOI columns into one preferred value.
        data['doi_def'] = data.apply(doi_def, axis=1)
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['doi_def'])
        data.to_csv('output/doi_def_scholar_dblp_semantics.csv',
                    index=False, sep=';', encoding='utf-8')

        st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
        st.write(data)

    if step_2:
        # Choose whether citations come from Scholar, Semantic Scholar,
        # or the per-row maximum of the two.
        citation_source = st.selectbox(
            "Select Citation Source",
            ['cites_scholar', 'cites_semantic', 'max_cit'])

        if citation_source == 'max_cit':
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)

    if step_3:
        if citation_source is None:
            st.warning("Run step 2 first to select a citation source.")
        else:
            # Sum citations per (possibly multi-author) 'Authors' string.
            authors_citation_counts = data.groupby(
                'Authors')[citation_source].sum().reset_index()

            # Sort authors by descending citations
            top_authors = authors_citation_counts.sort_values(
                by=citation_source, ascending=False)

            # Split comma-separated author lists and credit each author
            # with the full citation count of the joint publications.
            rows_to_concat = []
            for _, row in top_authors.iterrows():
                authors = row['Authors'].split(', ')
                citations = row[citation_source]
                for author in authors:
                    rows_to_concat.append(
                        {'Author': author, 'Total Citations': citations})

            # Build the frame directly from the rows: the previous code
            # concatenated onto an empty frame seeded with mismatched
            # columns ('Authors' vs 'Author'), leaving spurious NaN columns.
            author_citation_counts = pd.DataFrame(rows_to_concat)

            # Group by author and sum the citations
            author_citation_totals = author_citation_counts.groupby(
                'Author')['Total Citations'].sum().reset_index()

            # Display the results
            st.write("Total Citations for Each Author:")
            st.dataframe(author_citation_totals)

            # Top five years by total citations.
            st.write("\nYears in which papers received the most citations:")
            years_with_most_citations = data.groupby(
                'Year')[citation_source].sum().reset_index()
            years_with_most_citations = years_with_most_citations.sort_values(
                by=citation_source, ascending=False).head(5)
            st.write(years_with_most_citations)

    if step_4:
        # Topic-modeling controls live in their own sidebar section.
        st.sidebar.title("Topic Modeling Settings")
        num_topics = st.sidebar.number_input(
            "Number of Topics", min_value=1, value=5, step=1)
        st.write(f"You have selected {num_topics} topics for topic modeling.")

        # Tokenize every title once; the dictionary built over the full
        # corpus is reused by the per-year models below so topic ids
        # share a vocabulary.
        titles = data['Title'].tolist()
        preprocessed_titles = [preprocess_text(title) for title in titles]
        dictionary = corpora.Dictionary(preprocessed_titles)
        corpus = [dictionary.doc2bow(tokens) for tokens in preprocessed_titles]

        # Global LDA model over all titles (same hyperparameters as the
        # helper; previously duplicated inline).
        lda_model = train_lda_model(corpus, dictionary,
                                    num_topics=num_topics, passes=15)

        # Render the interactive pyLDAvis view inside Streamlit.
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        html_string = pyLDAvis.prepared_data_to_html(vis_data)

        st.subheader("LDAVis Topic Visualization")
        st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
        st.components.v1.html(html_string, height=1100, width=1400)

        # Train one LDA model per publication year (shared dictionary).
        years = sorted(data['Year'].unique())
        lda_models = {}
        for year in years:
            year_data = data[data['Year'] == year]
            year_titles = year_data['Title'].tolist()
            preprocessed_year_titles = [
                preprocess_text(title) for title in year_titles]
            year_corpus = [dictionary.doc2bow(tokens)
                           for tokens in preprocessed_year_titles]
            lda_models[year] = train_lda_model(
                year_corpus, dictionary, num_topics=num_topics, passes=15)

        # Aggregate each topic's summed term weight per year. `topics`
        # keeps the labels of the most recent year's model; initialized
        # here so an empty dataset cannot raise a NameError below.
        topic_data = {}
        topics = []
        for year, year_model in lda_models.items():
            year_topics = year_model.show_topics(num_words=num_topics)
            topics = [f"Topic {topic[0]}" for topic in year_topics]
            topic_weights = [sum(float(w.split('*')[0])
                                 for w in topic[1].split(" + ")) for topic in year_topics]
            topic_data[year] = dict(zip(topics, topic_weights))

        df = pd.DataFrame(topic_data).T

        # Years as category labels for the x-axis.
        years_list = [str(year) for year in df.index]
        colorpalette = 'gnuplot'
        # One stable color per topic, shared by line and trend traces.
        rgb_colors = sns.color_palette(colorpalette, num_topics)
        topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))

        # Label each topic's overall direction by comparing the last
        # year's weight with the first. NOTE(review): if a topic id is
        # missing for some year, df[topic] holds NaN and the comparison
        # yields "Decreasing" — confirm against real data.
        trend_data = {}
        for topic in topics:
            topic_values = df[topic].values
            trend = "Increasing" if topic_values[-1] > topic_values[0] else "Decreasing"
            trend_data[topic] = trend

        # Create a stacked bar chart
        st.write("# Evolution of Topics Over Years")

        # Interactive Plotly line chart: one solid line per topic plus a
        # dashed least-squares trend line in the same color.
        fig = go.Figure()
        years_list_numeric = [int(year) for year in years_list]

        for i, topic in enumerate(topics):
            fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
                                     name=f"Topic {i+1}", line=dict(color=topic_colors[i])))

            # Degree-1 polynomial fit = linear regression of weight on year.
            z = np.polyfit(years_list_numeric, df[topic], 1)
            p = np.poly1d(z)
            fig.add_trace(go.Scatter(x=years_list_numeric, y=p(years_list_numeric),
                                     mode='lines', name=f"Trend {i+1}: {trend_data[topic]}",
                                     line=dict(color=topic_colors[i], dash='dash')))

        fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
                          title='Evolution of Topics Over Years',
                          xaxis=dict(type='category'))

        st.plotly_chart(fig)

        # One word cloud per year built from its model's top terms.
        for year, year_model in lda_models.items():
            year_topics = year_model.show_topics(num_words=5)
            topics_words = [word for topic in year_topics for word in topic[1].split(" + ")]
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                  colormap=colorpalette).generate(" ".join(topics_words))
            st.subheader(f"Word Cloud for Year {year}")
            st.image(wordcloud.to_image())

# Launch the Streamlit app only when executed directly, not on import.
if __name__ == "__main__":
    main()