Spaces:
Runtime error
Runtime error
File size: 10,294 Bytes
ea361ad df6bc1d ea361ad df6bc1d ea361ad dfb6640 ea361ad df6bc1d ea361ad df6bc1d ce81144 ea361ad bd2c1ff def4987 dfb6640 df6bc1d ea361ad df6bc1d ea361ad ce81144 ea361ad df6bc1d bb43809 df6bc1d bb43809 df6bc1d c9a8c95 df6bc1d c9a8c95 df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d ea361ad df6bc1d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import string
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.graph_objects as go
import matplotlib as mpl
import numpy as np
def load_data():
    """Load the merged Scholar/DBLP/Semantic Scholar export as a DataFrame.

    Reads the semicolon-separated UTF-8 CSV produced by the crawler.
    """
    return pd.read_csv(
        "output/scholar_dblp_semantics.csv",
        sep=";",
        encoding="utf-8",
    )
def preprocess_text(text):
    """Normalize *text* into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase -> word-tokenize -> drop English stopwords and
    single-character punctuation tokens -> lemmatize -> strip leftover
    contraction fragments ("'s", "n't", "'d", "'m") and double quotes.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    # Keep only content tokens; string.punctuation filters one-char symbols.
    kept = [tok for tok in tokens
            if tok not in stop_words and tok not in string.punctuation]
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in kept]
    # Contraction pieces survive tokenization as their own tokens or
    # suffixes; scrub them out after lemmatization.
    cleaned = []
    for tok in lemmas:
        for fragment in ("'s", "n't", "'d", "'m", '"'):
            tok = tok.replace(fragment, "")
        cleaned.append(tok)
    return cleaned
def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Fit and return a gensim LDA model over a bag-of-words corpus.

    `corpus` is a list of doc2bow vectors, `dictionary` the gensim
    Dictionary used to build them; `alpha='auto'` lets gensim learn an
    asymmetric document-topic prior.
    """
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        alpha='auto',
    )
def doi_def(row):
    """Pick the first usable DOI from a row's candidate columns.

    Checks 'DOI', then 'doi_scholar', then 'doi_semantic'; the crawler
    stores the literal string 'None' for missing values, so that is the
    sentinel compared against (and returned when no DOI is present).
    """
    for column in ('DOI', 'doi_scholar', 'doi_semantic'):
        candidate = row[column]
        if candidate != 'None':
            return candidate
    return 'None'
def main():
    """Streamlit entry point: four-step citation/topic analysis dashboard.

    Step 1 displays the crawled data and consolidates DOIs, step 2 selects
    which citation column to analyze, step 3 aggregates citations per
    author and per year, and step 4 runs LDA topic modeling over titles.
    """
    # nltk.download('punkt')
    # nltk.download('stopwords')
    # print(nltk.data.path)
    # Use the corpora bundled with the app instead of downloading at runtime.
    nltk.data.path.append("nltk_data")
    st.write("# Analysis of Knowledge")
    data = load_data()
    # Step 1: Display the data retrieved from DBLP
    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    # Step 2: Select Citation Source
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    # Step 3: Elaborate Analysis
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    # Step 4: Analyze Topics
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")
    # FIX: citation_source was only bound inside `if step_2:`, so checking
    # step 3 without step 2 raised NameError. Initialize it and guard below.
    citation_source = None
    if step_1:
        st.write("Crawled data:")
        data['doi_def'] = data.apply(doi_def, axis=1)
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['doi_def'])
        data.to_csv('output/doi_def_scholar_dblp_semantics.csv',
                    index=False, sep=';', encoding='utf-8')
        st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
        st.write(data)
    if step_2:
        # Choose whether to calculate citations from 'cites_scholar', 'cites_semantic', or 'max_cit'
        citation_source = st.selectbox(
            "Select Citation Source", ['cites_scholar', 'cites_semantic', 'max_cit'])
        # 'max_cit' is a derived column: per-row maximum of the two sources.
        if citation_source == 'max_cit':
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)
    if step_3 and citation_source is None:
        # Step 3 needs the citation column chosen in step 2.
        st.warning("Please enable step 2 and select a citation source first.")
    elif step_3:
        # Group data by author string and sum citations for each group.
        authors_citation_counts = data.groupby(
            'Authors')[citation_source].sum().reset_index()
        # Sort author groups by descending citations.
        top_authors = authors_citation_counts.sort_values(
            by=citation_source, ascending=False)
        # Split comma-separated author lists, crediting the full citation
        # count of a paper to each of its authors.
        rows_to_concat = []
        for _, row in top_authors.iterrows():
            authors = row['Authors'].split(', ')
            citations = row[citation_source]
            for author in authors:
                rows_to_concat.append({'Author': author, 'Total Citations': citations})
        # FIX: the original concatenated these rows onto an empty frame whose
        # columns ('Authors', citation_source) did not match the row keys,
        # leaving spurious all-NaN columns. Build the frame directly; the
        # explicit columns also keep the groupby below valid on empty input.
        author_citation_counts = pd.DataFrame(
            rows_to_concat, columns=['Author', 'Total Citations'])
        # Group by individual author and sum the citations.
        author_citation_totals = author_citation_counts.groupby(
            'Author')['Total Citations'].sum().reset_index()
        # Display the results
        st.write("Total Citations for Each Author:")
        st.dataframe(author_citation_totals)
        # Find the years in which papers received the most citations
        st.write("\nYears in which papers received the most citations:")
        years_with_most_citations = data.groupby(
            'Year')[citation_source].sum().reset_index()
        years_with_most_citations = years_with_most_citations.sort_values(
            by=citation_source, ascending=False).head(5)
        st.write(years_with_most_citations)
    if step_4:
        st.sidebar.title("Topic Modeling Settings")
        # Number input for selecting how many topics the LDA model learns.
        num_topics = st.sidebar.number_input(
            "Number of Topics", min_value=1, value=5, step=1)
        st.write(f"You have selected {num_topics} topics for topic modeling.")
        # Preprocess all paper titles (tokenization, stopword removal, ...).
        titles = data['Title'].tolist()
        preprocessed_titles = [preprocess_text(title) for title in titles]
        # Create a dictionary and a bag-of-words corpus.
        dictionary = corpora.Dictionary(preprocessed_titles)
        corpus = [dictionary.doc2bow(title_tokens)
                  for title_tokens in preprocessed_titles]
        # Build the global LDA model (reuse the shared helper — identical
        # parameters to the previous inline LdaModel call).
        lda_model = train_lda_model(corpus, dictionary,
                                    num_topics=num_topics, passes=15)
        # Visualize the topics with pyLDAvis.
        vis_data = pyLDAvis.gensim_models.prepare(
            lda_model, corpus, dictionary)
        html_string = pyLDAvis.prepared_data_to_html(vis_data)
        st.subheader("LDAVis Topic Visualization")
        st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
        st.components.v1.html(html_string, height=1100, width=1400)
        # Group data by year and train an LDA model for each year.
        years = sorted(data['Year'].unique())  # ascending order
        lda_models = {}
        for year in years:
            year_data = data[data['Year'] == year]
            year_titles = year_data['Title'].tolist()
            preprocessed_year_titles = [
                preprocess_text(title) for title in year_titles]
            # Reuse the global dictionary so topic/term ids stay comparable
            # across years.
            year_corpus = [dictionary.doc2bow(title_tokens)
                           for title_tokens in preprocessed_year_titles]
            lda_models[year] = train_lda_model(
                year_corpus, dictionary, num_topics=num_topics, passes=15)
        # Build a DataFrame of aggregated topic weights per year.
        topic_data = {}
        for year, lda_model in lda_models.items():
            year_topics = lda_model.show_topics(num_words=num_topics)
            topics = [f"Topic {topic[0]}" for topic in year_topics]
            # Each topic string looks like "0.05*word + 0.03*word ..."; sum
            # the per-word probabilities as the topic's weight for the year.
            topic_weights = [sum(float(w.split('*')[0])
                                 for w in topic[1].split(" + ")) for topic in year_topics]
            topic_data[year] = dict(zip(topics, topic_weights))
        df = pd.DataFrame(topic_data).T
        # Years as strings for the categorical x-axis.
        years_list = [str(year) for year in df.index]
        colorpalette = 'gnuplot'
        # Generate a color palette with Seaborn, converted to hex codes so
        # Plotly line colors stay consistent across traces.
        rgb_colors = sns.color_palette(colorpalette, num_topics)
        topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))
        # Classify each topic's weight trend over time (last vs first year).
        # NOTE(review): `topics` holds the labels from the last year's model;
        # assumes every year exposes the same topic ids.
        trend_data = {}
        for topic in topics:
            topic_values = df[topic].values
            trend = "Increasing" if topic_values[-1] > topic_values[0] else "Decreasing"
            trend_data[topic] = trend
        st.write("# Evolution of Topics Over Years")
        # Interactive Plotly line chart for all topics, plus a dashed
        # linear-regression trend line per topic in the same color.
        fig = go.Figure()
        years_list_numeric = [int(year) for year in years_list]
        for i, topic in enumerate(topics):
            fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
                                     name=f"Topic {i+1}", line=dict(color=topic_colors[i])))
            z = np.polyfit(years_list_numeric, df[topic], 1)
            p = np.poly1d(z)
            fig.add_trace(go.Scatter(x=years_list_numeric, y=p(years_list_numeric),
                                     mode='lines', name=f"Trend {i+1}: {trend_data[topic]}",
                                     line=dict(color=topic_colors[i], dash='dash')))
        fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
                          title='Evolution of Topics Over Years', xaxis=dict(type='category'))
        st.plotly_chart(fig)
        # Render a word cloud of the top topic terms for each year.
        for year, lda_model in lda_models.items():
            year_topics = lda_model.show_topics(num_words=5)
            topics_words = [word for topic in year_topics
                            for word in topic[1].split(" + ")]
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                  colormap=colorpalette).generate(" ".join(topics_words))
            st.subheader(f"Word Cloud for Year {year}")
            st.image(wordcloud.to_image())
# Launch the Streamlit app when this file is executed as a script.
if __name__ == "__main__":
    main()
|