# ircdl_data/analysis.py
# Author: Eleonora Bernasconi
# (header reconstructed from repository metadata; original lines were
# non-Python residue from a rendered code page)
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import string
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.graph_objects as go
import matplotlib as mpl
import numpy as np
def load_data():
    """Load the merged Scholar/DBLP/Semantic Scholar dataset.

    Returns:
        pd.DataFrame: contents of ``output/scholar_dblp_semantics.csv``
        (semicolon-separated, UTF-8).
    """
    csv_path = "output/scholar_dblp_semantics.csv"
    return pd.read_csv(csv_path, sep=";", encoding="utf-8")
def preprocess_text(text):
    """Normalise a text string into a list of clean tokens for LDA.

    Pipeline: lowercase -> NLTK word tokenization -> drop English stopwords
    and single-character punctuation -> WordNet lemmatization -> strip
    contraction fragments left by the tokenizer.

    Args:
        text: Raw text, e.g. a paper title.

    Returns:
        list[str]: cleaned tokens (empty tokens removed).
    """
    # Convert text to lowercase so stopword/lemma lookups match.
    text = text.lower()
    # Tokenize the text into words.
    words = word_tokenize(text)
    # Remove stopwords and punctuation (string.punctuation only matches
    # single-character tokens; multi-char symbols pass through).
    stop_words = set(stopwords.words('english'))
    words = [
        word for word in words
        if word not in stop_words and word not in string.punctuation]
    # Lemmatize words (noun lemmas by default).
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Strip contraction fragments ("'s", "n't", "'d", "'m") and stray quotes.
    words = [word.replace("'s", "").replace("n't", "").replace(
        "'d", "").replace("'m", "").replace('"', "") for word in words]
    # Fix: the replacements above can reduce a token to the empty string
    # (e.g. a standalone "'s" token); drop empties so they do not pollute
    # the gensim dictionary built from this output.
    return [word for word in words if word]
def train_lda_model(corpus, dictionary, num_topics, passes=15):
    """Fit a gensim LDA model on a bag-of-words corpus.

    Args:
        corpus: BoW corpus, i.e. a list of (token_id, count) documents.
        dictionary: gensim ``Dictionary`` mapping token ids to words.
        num_topics: Number of latent topics to extract.
        passes: Training passes over the corpus (default 15).

    Returns:
        The trained ``LdaModel`` (asymmetric alpha learned from data).
    """
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        alpha='auto',
    )
def doi_def(row):
    """Pick the first usable DOI for a record.

    Checks the 'DOI', 'doi_scholar' and 'doi_semantic' fields in that
    order and returns the first one that is not the string 'None'.

    Args:
        row: A mapping (e.g. a pandas row) with the three DOI fields.

    Returns:
        str: the first non-'None' DOI value, or 'None' if all are missing.
    """
    for column in ('DOI', 'doi_scholar', 'doi_semantic'):
        value = row[column]
        if value != 'None':
            return value
    return 'None'
def main():
    """Streamlit entry point.

    Loads the crawled publication dataset and drives four optional,
    independently-toggled analysis steps from the sidebar:
    1) show raw data + consolidated DOI column, 2) choose a citation
    source, 3) per-author / per-year citation totals, 4) LDA topic
    modeling with pyLDAvis, per-year trends, and word clouds.
    """
    # NLTK resources (punkt, stopwords, wordnet) are expected to be
    # pre-downloaded into ./nltk_data rather than fetched at runtime.
    nltk.data.path.append("nltk_data")
    st.write("# Analysis of Knowledge")
    data = load_data()

    # Sidebar step toggles (each step is independent).
    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
    step_2 = st.sidebar.checkbox("2 - Select Citation Source")
    step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
    step_4 = st.sidebar.checkbox("4 - Analyze Topics")

    # Fix: give citation_source a default so enabling step 3 without
    # step 2 no longer raises NameError.
    citation_source = 'cites_scholar'

    if step_1:
        st.write("Crawled data:")
        # Consolidate the three DOI columns into a single 'doi_def' column.
        data['doi_def'] = data.apply(doi_def, axis=1)
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['doi_def'])
        data.to_csv('output/doi_def_scholar_dblp_semantics.csv',
                    index=False, sep=';', encoding='utf-8')
        st.write("Number of values in DOI column equal to None / Total values: "
                 + str(count_none) + "/" + str(total_values))
        st.write(data)

    if step_2:
        # Choose whether to count citations from Scholar, Semantic Scholar,
        # or the per-paper maximum of the two.
        citation_source = st.selectbox(
            "Select Citation Source",
            ['cites_scholar', 'cites_semantic', 'max_cit'])
        if citation_source == 'max_cit':
            data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)

    if step_3:
        # Group data by the (comma-separated) author string and sum citations.
        authors_citation_counts = data.groupby(
            'Authors')[citation_source].sum().reset_index()
        # Sort author groups by descending citations.
        top_authors = authors_citation_counts.sort_values(
            by=citation_source, ascending=False)

        # Split multi-author strings; each individual author is credited
        # with the paper's full citation count.
        rows_to_concat = []
        for _, row in top_authors.iterrows():
            citations = row[citation_source]
            for author in row['Authors'].split(', '):
                rows_to_concat.append(
                    {'Author': author, 'Total Citations': citations})
        # Fix: build the frame directly from the collected rows. The
        # original concatenated onto an empty frame whose columns
        # ('Authors', citation_source) did not match the row keys,
        # leaving spurious all-NaN columns in the result.
        author_citation_counts = pd.DataFrame(
            rows_to_concat, columns=['Author', 'Total Citations'])

        # Group by individual author and sum the citations.
        author_citation_totals = author_citation_counts.groupby(
            'Author')['Total Citations'].sum().reset_index()
        st.write("Total Citations for Each Author:")
        st.dataframe(author_citation_totals)

        # Years in which papers received the most citations (top 5).
        st.write("\nYears in which papers received the most citations:")
        years_with_most_citations = data.groupby(
            'Year')[citation_source].sum().reset_index()
        years_with_most_citations = years_with_most_citations.sort_values(
            by=citation_source, ascending=False).head(5)
        st.write(years_with_most_citations)

    if step_4:
        st.sidebar.title("Topic Modeling Settings")
        num_topics = st.sidebar.number_input(
            "Number of Topics", min_value=1, value=5, step=1)
        st.write(f"You have selected {num_topics} topics for topic modeling.")

        # Build a shared dictionary and a BoW corpus from all titles.
        titles = data['Title'].tolist()
        preprocessed_titles = [preprocess_text(title) for title in titles]
        dictionary = corpora.Dictionary(preprocessed_titles)
        corpus = [dictionary.doc2bow(tokens) for tokens in preprocessed_titles]

        # Global LDA model (consistency: reuse the train_lda_model helper
        # instead of duplicating the LdaModel call inline).
        lda_model = train_lda_model(
            corpus, dictionary, num_topics=num_topics, passes=15)

        # Visualize the topics with pyLDAvis.
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        html_string = pyLDAvis.prepared_data_to_html(vis_data)
        st.subheader("LDAVis Topic Visualization")
        st.markdown('<div style="padding: 20px 5px;"></div>',
                    unsafe_allow_html=True)
        st.components.v1.html(html_string, height=1100, width=1400)

        # Train one LDA model per publication year (same shared dictionary,
        # so token ids stay comparable across years).
        years = sorted(data['Year'].unique())
        lda_models = {}
        for year in years:
            year_data = data[data['Year'] == year]
            year_titles = year_data['Title'].tolist()
            preprocessed_year_titles = [
                preprocess_text(title) for title in year_titles]
            year_corpus = [dictionary.doc2bow(tokens)
                           for tokens in preprocessed_year_titles]
            lda_models[year] = train_lda_model(
                year_corpus, dictionary, num_topics=num_topics, passes=15)

        # Aggregate per-year topic weights (sum of the shown word weights
        # parsed from gensim's "w*\"word\" + ..." topic strings).
        # NOTE(review): show_topics(num_words=num_topics) uses the chosen
        # topic count as the *word* count per topic — confirm intentional.
        topic_data = {}
        topics = []
        for year, model in lda_models.items():
            year_topics = model.show_topics(num_words=num_topics)
            topics = [f"Topic {topic[0]}" for topic in year_topics]
            topic_weights = [
                sum(float(term.split('*')[0]) for term in topic[1].split(" + "))
                for topic in year_topics]
            topic_data[year] = dict(zip(topics, topic_weights))
        df = pd.DataFrame(topic_data).T

        # Consistent per-topic colors across all charts.
        years_list = [str(year) for year in df.index]
        colorpalette = 'gnuplot'
        rgb_colors = sns.color_palette(colorpalette, num_topics)
        topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))

        # Classify each topic's overall trend (last year vs first year).
        trend_data = {}
        for topic in topics:
            topic_values = df[topic].values
            trend_data[topic] = (
                "Increasing" if topic_values[-1] > topic_values[0]
                else "Decreasing")

        st.write("# Evolution of Topics Over Years")
        # Interactive Plotly line chart: one solid line per topic plus a
        # dashed linear-regression trend line in the same color.
        fig = go.Figure()
        years_list_numeric = [int(year) for year in years_list]
        for i, topic in enumerate(topics):
            fig.add_trace(go.Scatter(
                x=years_list, y=df[topic], mode='lines',
                name=f"Topic {i+1}", line=dict(color=topic_colors[i])))
            z = np.polyfit(years_list_numeric, df[topic], 1)
            p = np.poly1d(z)
            fig.add_trace(go.Scatter(
                x=years_list_numeric, y=p(years_list_numeric),
                mode='lines', name=f"Trend {i+1}: {trend_data[topic]}",
                line=dict(color=topic_colors[i], dash='dash')))
        fig.update_layout(
            xaxis_title='Year', yaxis_title='Topic Weight',
            title='Evolution of Topics Over Years',
            xaxis=dict(type='category'))
        st.plotly_chart(fig)

        # Word cloud of the top topic words for each year's model.
        for year, model in lda_models.items():
            year_topics = model.show_topics(num_words=5)
            topics_words = [word for topic in year_topics
                            for word in topic[1].split(" + ")]
            wordcloud = WordCloud(
                width=800, height=400, background_color='white',
                colormap=colorpalette).generate(" ".join(topics_words))
            st.subheader(f"Word Cloud for Year {year}")
            st.image(wordcloud.to_image())
if __name__ == "__main__":
main()