import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import string
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.graph_objects as go
import matplotlib as mpl
import numpy as np


def load_data():
    """Load the merged Scholar/DBLP/Semantic Scholar export as a DataFrame."""
    data = pd.read_csv("output/scholar_dblp_semantics.csv",
                       sep=";", encoding="utf-8")
    return data


def preprocess_text(text):
    """Tokenize and clean *text* for topic modeling.

    Pipeline: lowercase -> word-tokenize -> drop English stopwords and
    punctuation -> lemmatize -> strip contraction fragments the tokenizer
    leaves behind ("'s", "n't", "'d", "'m") and stray double quotes.

    Returns a list of cleaned tokens (may be empty).
    """
    text = text.lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words
             if word not in stop_words and word not in string.punctuation]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Remove contraction suffixes and quote characters from the tokens.
    words = [word.replace("'s", "").replace("n't", "").replace("'d", "")
                 .replace("'m", "").replace('"', "")
             for word in words]
    return words


def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Train and return a gensim LDA model over *corpus*."""
    return LdaModel(corpus=corpus, id2word=dictionary,
                    num_topics=num_topics, passes=passes, alpha='auto')


def doi_def(row):
    """Return the first usable DOI for a row.

    Priority order: DBLP 'DOI', then 'doi_scholar', then 'doi_semantic'.
    The data uses the literal string 'None' as its missing-value marker,
    which is also returned when no source has a DOI.
    """
    for column in ('DOI', 'doi_scholar', 'doi_semantic'):
        if row[column] != 'None':
            return row[column]
    return 'None'


def _show_crawled_data(data):
    """Step 1: consolidate DOIs, persist the result and show the raw table."""
    st.write("Crawled data:")
    data['doi_def'] = data.apply(doi_def, axis=1)
    count_none = (data['DOI'] == 'None').sum()
    total_values = len(data['doi_def'])
    data.to_csv('output/doi_def_scholar_dblp_semantics.csv',
                index=False, sep=';', encoding='utf-8')
    st.write("Number of values in DOI column equal to None / Total values: "
             + str(count_none) + "/" + str(total_values))
    st.write(data)


def _select_citation_source(data):
    """Step 2: let the user pick the citation column to analyze.

    When 'max_cit' is chosen, a derived column holding the per-row maximum
    of the Scholar and Semantic Scholar counts is added to *data* in place.
    Returns the selected column name.
    """
    citation_source = st.selectbox(
        "Select Citation Source",
        ['cites_scholar', 'cites_semantic', 'max_cit'])
    if citation_source == 'max_cit':
        data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)
    return citation_source


def _citation_analysis(data, citation_source):
    """Step 3: per-author citation totals and the top-5 citation years."""
    authors_citation_counts = data.groupby(
        'Authors')[citation_source].sum().reset_index()
    top_authors = authors_citation_counts.sort_values(
        by=citation_source, ascending=False)

    # Split comma-separated author strings so each individual author is
    # credited with the paper's full citation count.
    rows_to_concat = []
    for _, row in top_authors.iterrows():
        citations = row[citation_source]
        for author in row['Authors'].split(', '):
            rows_to_concat.append(
                {'Author': author, 'Total Citations': citations})
    # Build the frame once from the collected rows; the previous version
    # concatenated onto an empty frame with mismatched column names, which
    # produced spurious all-NaN columns.
    author_citation_counts = pd.DataFrame(
        rows_to_concat, columns=['Author', 'Total Citations'])

    author_citation_totals = author_citation_counts.groupby(
        'Author')['Total Citations'].sum().reset_index()

    st.write("Total Citations for Each Author:")
    st.dataframe(author_citation_totals)

    st.write("\nYears in which papers received the most citations:")
    years_with_most_citations = data.groupby(
        'Year')[citation_source].sum().reset_index()
    years_with_most_citations = years_with_most_citations.sort_values(
        by=citation_source, ascending=False).head(5)
    st.write(years_with_most_citations)


def _topic_analysis(data):
    """Step 4: LDA topic modeling, per-year topic evolution and word clouds."""
    st.sidebar.title("Topic Modeling Settings")
    num_topics = st.sidebar.number_input(
        "Number of Topics", min_value=1, value=5, step=1)
    st.write(f"You have selected {num_topics} topics for topic modeling.")

    # Corpus over all titles; the dictionary is shared with the per-year
    # models below so topic ids stay comparable across years.
    titles = data['Title'].tolist()
    preprocessed_titles = [preprocess_text(title) for title in titles]
    dictionary = corpora.Dictionary(preprocessed_titles)
    corpus = [dictionary.doc2bow(tokens) for tokens in preprocessed_titles]

    # Global model (reuses the shared helper instead of an inline LdaModel).
    lda_model = train_lda_model(corpus, dictionary,
                                num_topics=num_topics, passes=15)

    # Interactive pyLDAvis view of the global model.
    vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(vis_data)
    st.subheader("LDAVis Topic Visualization")
    # NOTE(review): the markdown payload was truncated in the original
    # source; presumably it injected CSS to widen the page — confirm and
    # restore the intended content.
    st.markdown('\n', unsafe_allow_html=True)
    st.components.v1.html(html_string, height=1100, width=1400)

    # Train one LDA model per publication year.
    years = sorted(data['Year'].unique())
    lda_models = {}
    for year in years:
        year_titles = data[data['Year'] == year]['Title'].tolist()
        year_corpus = [dictionary.doc2bow(preprocess_text(title))
                       for title in year_titles]
        lda_models[year] = train_lda_model(
            year_corpus, dictionary, num_topics=num_topics, passes=15)

    # Aggregate a weight per (year, topic): the sum of the word
    # probabilities gensim reports for that topic's top words.
    topic_data = {}
    topics = []
    for year, model in lda_models.items():
        year_topics = model.show_topics(num_words=num_topics)
        topics = [f"Topic {topic_id}" for topic_id, _ in year_topics]
        topic_weights = [
            sum(float(term.split('*')[0]) for term in spec.split(" + "))
            for _, spec in year_topics]
        topic_data[year] = dict(zip(topics, topic_weights))
    df = pd.DataFrame(topic_data).T

    years_list = [str(year) for year in df.index]
    colorpalette = 'gnuplot'
    # One stable hex color per topic so line and trend traces match.
    rgb_colors = sns.color_palette(colorpalette, num_topics)
    topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))

    # Coarse trend label: last year's weight versus the first year's.
    trend_data = {}
    for topic in topics:
        topic_values = df[topic].values
        trend_data[topic] = ("Increasing"
                             if topic_values[-1] > topic_values[0]
                             else "Decreasing")

    st.write("# Evolution of Topics Over Years")
    fig = go.Figure()
    years_numeric = [int(year) for year in years_list]
    for i, topic in enumerate(topics):
        fig.add_trace(go.Scatter(
            x=years_list, y=df[topic], mode='lines',
            name=f"Topic {i+1}", line=dict(color=topic_colors[i])))
        # Dashed linear-regression trend line for the same topic.
        coeffs = np.polyfit(years_numeric, df[topic], 1)
        trend_line = np.poly1d(coeffs)
        fig.add_trace(go.Scatter(
            x=years_numeric, y=trend_line(years_numeric), mode='lines',
            name=f"Trend {i+1}: {trend_data[topic]}",
            line=dict(color=topic_colors[i], dash='dash')))
    fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
                      title='Evolution of Topics Over Years',
                      xaxis=dict(type='category'))
    st.plotly_chart(fig)

    # One word cloud per year, built from that year's top topic terms.
    for year, model in lda_models.items():
        year_topics = model.show_topics(num_words=5)
        topics_words = [word
                        for _, spec in year_topics
                        for word in spec.split(" + ")]
        wordcloud = WordCloud(
            width=800, height=400, background_color='white',
            colormap=colorpalette).generate(" ".join(topics_words))
        st.subheader(f"Word Cloud for Year {year}")
        st.image(wordcloud.to_image())


def main():
    """Streamlit entry point: step-by-step analysis of the crawled corpus."""
    # nltk.download('punkt')
    # nltk.download('stopwords')
    # NLTK resources are vendored under ./nltk_data instead of downloaded.
    nltk.data.path.append("nltk_data")

    st.write("# Analysis of Knowledge")
    data = load_data()

    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")

    # Default so step 3 no longer raises NameError when step 2 is skipped.
    citation_source = 'cites_scholar'

    if step_1:
        _show_crawled_data(data)
    if step_2:
        citation_source = _select_citation_source(data)
    if step_3:
        _citation_analysis(data, citation_source)
    if step_4:
        _topic_analysis(data)


if __name__ == "__main__":
    main()