# NOTE(review): the "Spaces:" / "Runtime error" lines below this file's header were
# Hugging Face Spaces page-scrape artifacts, not code; kept only as this comment so
# the module stays importable.
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from wordcloud import WordCloud | |
| import seaborn as sns | |
| import gensim | |
| from gensim import corpora | |
| from gensim.models.ldamodel import LdaModel | |
| import pyLDAvis.gensim_models as gensimvis | |
| import string | |
| import pyLDAvis | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| import plotly.graph_objects as go | |
| import matplotlib as mpl | |
| import numpy as np | |
def load_data():
    """Read the merged Scholar/DBLP/Semantic Scholar export into a DataFrame.

    The CSV is semicolon-separated and UTF-8 encoded.
    """
    csv_path = "output/scholar_dblp_semantics.csv"
    return pd.read_csv(csv_path, sep=";", encoding="utf-8")
def preprocess_text(text):
    """Normalize ``text`` into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase, word-tokenize, drop English stopwords and
    punctuation tokens, lemmatize, then strip contraction suffixes
    ("'s", "n't", "'d", "'m") and double quotes from each token.
    May yield empty strings when a token consists only of a stripped suffix.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in tokens
            if tok not in stop_words and tok not in string.punctuation]
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in kept]
    # Strip contraction leftovers and stray quotes, in the same order as before.
    cleaned = []
    for lemma in lemmas:
        for junk in ("'s", "n't", "'d", "'m", '"'):
            lemma = lemma.replace(junk, "")
        cleaned.append(lemma)
    return cleaned
def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Fit and return a gensim LDA model over ``corpus``.

    ``dictionary`` supplies the id-to-word mapping; ``alpha='auto'`` lets
    gensim learn an asymmetric document-topic prior from the data.
    """
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        alpha='auto',
    )
def doi_def(row):
    """Pick the definitive DOI for a row, trying sources in priority order.

    Checks 'DOI', then 'doi_scholar', then 'doi_semantic'; returns the first
    value that is not the literal string 'None', else 'None'.
    """
    for column in ('DOI', 'doi_scholar', 'doi_semantic'):
        if row[column] != 'None':
            return row[column]
    return 'None'
def _show_crawled_data(data):
    """Step 1: resolve a definitive DOI per row, persist it, show the table."""
    st.write("Crawled data:")
    data['doi_def'] = data.apply(doi_def, axis=1)
    count_none = (data['DOI'] == 'None').sum()
    total_values = len(data['doi_def'])
    data.to_csv('output/doi_def_scholar_dblp_semantics.csv',
                index=False, sep=';', encoding='utf-8')
    st.write("Number of values in DOI column equal to None / Total values: "
             + str(count_none) + "/" + str(total_values))
    st.write(data)


def _show_citation_analysis(data, citation_source):
    """Step 3: per-author citation totals and the five most-cited years."""
    # Sum citations per (comma-separated) author string, most cited first.
    grouped = data.groupby('Authors')[citation_source].sum().reset_index()
    top_authors = grouped.sort_values(by=citation_source, ascending=False)
    # Explode the author lists so each individual author is credited with the
    # row's full citation count.
    # BUG FIX: the accumulator DataFrame used to be seeded with columns
    # ['Authors', citation_source] while rows carried 'Author'/'Total
    # Citations', leaving stray all-NaN columns after the concat.
    rows = []
    for _, row in top_authors.iterrows():
        for author in row['Authors'].split(', '):
            rows.append({'Author': author, 'Total Citations': row[citation_source]})
    author_citation_counts = pd.DataFrame(rows, columns=['Author', 'Total Citations'])
    author_citation_totals = (author_citation_counts
                              .groupby('Author')['Total Citations']
                              .sum()
                              .reset_index())
    st.write("Total Citations for Each Author:")
    st.dataframe(author_citation_totals)

    st.write("\nYears in which papers received the most citations:")
    yearly = data.groupby('Year')[citation_source].sum().reset_index()
    yearly = yearly.sort_values(by=citation_source, ascending=False).head(5)
    st.write(yearly)


def _show_topic_analysis(data):
    """Step 4: LDA topic modeling — global view, per-year evolution, clouds."""
    st.sidebar.title("Topic Modeling Settings")
    num_topics = st.sidebar.number_input(
        "Number of Topics", min_value=1, value=5, step=1)
    st.write(f"You have selected {num_topics} topics for topic modeling.")

    # One global model over every title; the dictionary is reused for the
    # per-year models so topic ids stay comparable across years.
    titles = data['Title'].tolist()
    preprocessed_titles = [preprocess_text(title) for title in titles]
    dictionary = corpora.Dictionary(preprocessed_titles)
    corpus = [dictionary.doc2bow(tokens) for tokens in preprocessed_titles]
    lda_model = train_lda_model(corpus, dictionary, num_topics=num_topics)

    # Interactive pyLDAvis view embedded as raw HTML.
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(vis_data)
    st.subheader("LDAVis Topic Visualization")
    st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
    st.components.v1.html(html_string, height=1100, width=1400)

    # Train one LDA model per publication year.
    lda_models = {}
    for year in sorted(data['Year'].unique()):
        year_titles = data.loc[data['Year'] == year, 'Title'].tolist()
        year_corpus = [dictionary.doc2bow(preprocess_text(title))
                       for title in year_titles]
        lda_models[year] = train_lda_model(
            year_corpus, dictionary, num_topics=num_topics)

    # Aggregate per-year topic weight: sum of the displayed word probabilities
    # (show_topics terms look like '0.052*"word"', joined by " + ").
    topic_data = {}
    for year, model in lda_models.items():
        year_topics = model.show_topics(num_words=num_topics)
        topic_data[year] = {
            f"Topic {topic_id}": sum(float(term.split('*')[0])
                                     for term in terms.split(" + "))
            for topic_id, terms in year_topics
        }
    df = pd.DataFrame(topic_data).T

    # BUG FIX: trend/plot loops previously used the `topics` labels left over
    # from the LAST year's model only; use every topic column of the frame.
    topic_labels = list(df.columns)
    years_list = [str(year) for year in df.index]
    years_numeric = [int(year) for year in df.index]

    colorpalette = 'gnuplot'
    rgb_colors = sns.color_palette(colorpalette, num_topics)
    topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))

    # Crude trend label: compare the final year's weight against the first.
    trend_data = {
        topic: "Increasing" if df[topic].values[-1] > df[topic].values[0]
        else "Decreasing"
        for topic in topic_labels
    }

    st.write("# Evolution of Topics Over Years")
    fig = go.Figure()
    for i, topic in enumerate(topic_labels):
        # Modulo guards against more topic columns than palette entries.
        color = topic_colors[i % len(topic_colors)]
        fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
                                 name=f"Topic {i+1}", line=dict(color=color)))
        # Dashed linear-regression trend line for the same topic.
        coeffs = np.polyfit(years_numeric, df[topic], 1)
        trend_fn = np.poly1d(coeffs)
        fig.add_trace(go.Scatter(x=years_numeric, y=trend_fn(years_numeric),
                                 mode='lines',
                                 name=f"Trend {i+1}: {trend_data[topic]}",
                                 line=dict(color=color, dash='dash')))
    fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
                      title='Evolution of Topics Over Years',
                      xaxis=dict(type='category'))
    st.plotly_chart(fig)

    # A word cloud of each year's top topic words.
    for year, model in lda_models.items():
        year_topics = model.show_topics(num_words=5)
        topics_words = [word for _, terms in year_topics
                        for word in terms.split(" + ")]
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap=colorpalette).generate(" ".join(topics_words))
        st.subheader(f"Word Cloud for Year {year}")
        st.image(wordcloud.to_image())


def main():
    """Streamlit dashboard: four opt-in analysis steps over the crawled
    Scholar/DBLP/Semantic Scholar publication data."""
    # NLTK corpora (punkt, stopwords, wordnet) are vendored in ./nltk_data.
    nltk.data.path.append("nltk_data")
    st.write("# Analysis of Knowledge")
    data = load_data()

    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")

    if step_1:
        _show_crawled_data(data)

    # BUG FIX: citation_source used to be bound only inside the step_2 branch,
    # so checking step 3 without step 2 raised a NameError. Default to
    # 'cites_scholar' and let step 2 override it.
    citation_source = 'cites_scholar'
    if step_2:
        citation_source = st.selectbox(
            "Select Citation Source",
            ['cites_scholar', 'cites_semantic', 'max_cit'])
        if citation_source == 'max_cit':
            # Row-wise maximum of the two citation counts.
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)

    if step_3:
        _show_citation_analysis(data, citation_source)

    if step_4:
        _show_topic_analysis(data)
# Run the dashboard when executed directly (e.g. via `streamlit run`).
if __name__ == "__main__":
    main()