Eleonora Bernasconi committed on
Commit
df6bc1d
·
1 Parent(s): ea361ad
__pycache__/analysis.cpython-37.pyc CHANGED
Binary files a/__pycache__/analysis.cpython-37.pyc and b/__pycache__/analysis.cpython-37.pyc differ
 
__pycache__/intro.cpython-37.pyc CHANGED
Binary files a/__pycache__/intro.cpython-37.pyc and b/__pycache__/intro.cpython-37.pyc differ
 
analysis.py CHANGED
@@ -1,56 +1,249 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
 
4
  import seaborn as sns
5
-
6
- st.write("# Analysis of Knowledge")
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def load_data():
9
- data = pd.read_csv("output/scholar_dblp_semantics.csv", sep=";", encoding="utf-8")
 
10
  return data
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def main():
 
 
13
  data = load_data()
14
 
15
  # Step 1: Display the data retrieved from DBLP
16
  step_1 = st.sidebar.checkbox("1 - Display the crawled data")
 
 
 
 
 
 
 
17
  if step_1:
18
  st.write("Crawled data:")
19
  st.write(data)
20
 
21
- # Choose whether to calculate citations from 'cites_scholar', 'cites_semantic', or 'max_cit'
22
- citation_source = st.selectbox("Select Citation Source", ['cites_scholar', 'cites_semantic', 'max_cit'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Calculate citations based on the selected source
25
- if citation_source == 'max_cit':
26
- data['max_cit'] = data[['cites_scholar', 'cites_semantic']].max(axis=1)
27
- citation_source = 'max_cit'
28
 
29
- # Group data by author and sum citations for each author
30
- authors_citation_counts = data.groupby('Authors')[citation_source].sum().reset_index()
 
31
 
32
- # Sort authors by descending citations
33
- top_authors = authors_citation_counts.sort_values(by=citation_source, ascending=False)
34
 
35
- # Separate authors and calculate citations for each author
36
- author_citation_counts = pd.DataFrame(columns=['Authors', citation_source])
 
 
 
37
 
38
- for _, row in top_authors.iterrows():
39
- authors = row['Authors'].split(', ')
40
- citations = row[citation_source]
41
 
42
- for author in authors:
43
- author_citation_counts = author_citation_counts.append({'Author': author, 'Total Citations': citations}, ignore_index=True)
44
 
45
- # Group by author and sum the citations
46
- author_citation_totals = author_citation_counts.groupby('Author')['Total Citations'].sum().reset_index()
 
 
 
 
 
 
47
 
48
- # Display the results
49
- st.write("Total Citations for Each Author:")
50
- st.dataframe(author_citation_totals)
51
 
52
- # Find the years in which papers received the most citations
53
- st.write("\nYears in which papers received the most citations:")
54
- years_with_most_citations = data.groupby('Year')[citation_source].sum().reset_index()
55
- years_with_most_citations = years_with_most_citations.sort_values(by=citation_source, ascending=False).head(5)
56
- st.dataframe(years_with_most_citations)
 
1
  import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
+ from wordcloud import WordCloud
5
  import seaborn as sns
6
+ import gensim
7
+ from gensim import corpora
8
+ from gensim.models.ldamodel import LdaModel
9
+ import pyLDAvis.gensim_models as gensimvis
10
+ import string
11
+ import pyLDAvis
12
+ import nltk
13
+ from nltk.corpus import stopwords
14
+ from nltk.tokenize import word_tokenize
15
+ from nltk.stem import WordNetLemmatizer
16
+ import plotly.express as px
17
+ import plotly.graph_objects as go
18
+ import matplotlib as mpl
19
+ import numpy as np
20
 
21
  def load_data():
22
+ data = pd.read_csv("output/scholar_dblp_semantics.csv",
23
+ sep=";", encoding="utf-8")
24
  return data
25
 
26
+
27
+ def preprocess_text(text):
28
+ # Convert text to lowercase
29
+ text = text.lower()
30
+
31
+ # Tokenize the text into words
32
+ words = word_tokenize(text)
33
+
34
+ # Remove stopwords and punctuation
35
+ stop_words = set(stopwords.words('english'))
36
+ words = [
37
+ word for word in words if word not in stop_words and word not in string.punctuation]
38
+
39
+ # Lemmatize words
40
+ lemmatizer = WordNetLemmatizer()
41
+ words = [lemmatizer.lemmatize(word) for word in words]
42
+
43
+ # Remove "'s" and "'nt" from words
44
+ words = [word.replace("'s", "").replace("n't", "").replace(
45
+ "'d", "").replace("'m", "").replace('"', "") for word in words]
46
+
47
+ return words
48
+
49
+
50
+ def train_lda_model(corpus, dictionary, num_topics, passes=15):
51
+ lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes, alpha='auto')
52
+ return lda_model
53
+
54
  def main():
55
+ st.write("# Analysis of Knowledge")
56
+
57
  data = load_data()
58
 
59
  # Step 1: Display the data retrieved from DBLP
60
  step_1 = st.sidebar.checkbox("1 - Display the crawled data")
61
+ # Step 2: Select Citation Source
62
+ step_2 = st.sidebar.checkbox("2 - Select Citation Source")
63
+ # Step 3: Elaborate Analysis
64
+ step_3 = st.sidebar.checkbox("3 - Elaborate Analysis")
65
+ # Step 4: Analyze Topics
66
+ step_4 = st.sidebar.checkbox("4 - Analyze Topics")
67
+
68
  if step_1:
69
  st.write("Crawled data:")
70
  st.write(data)
71
 
72
+ if step_2:
73
+ # Choose whether to calculate citations from 'cites_scholar', 'cites_semantic', or 'max_cit'
74
+ citation_source = st.selectbox("Select Citation Source", ['cites_scholar', 'cites_semantic', 'max_cit'])
75
+
76
+ # Calculate citations based on the selected source
77
+ if citation_source == 'max_cit':
78
+ data['max_cit'] = data[['cites_scholar','cites_semantic']].max(axis=1)
79
+ citation_source = 'max_cit'
80
+
81
+ if step_3:
82
+ # Group data by author and sum citations for each author
83
+ authors_citation_counts = data.groupby(
84
+ 'Authors')[citation_source].sum().reset_index()
85
+
86
+ # Sort authors by descending citations
87
+ top_authors = authors_citation_counts.sort_values(
88
+ by=citation_source, ascending=False)
89
+
90
+ # Separate authors and calculate citations for each author
91
+ author_citation_counts = pd.DataFrame(columns=['Authors', citation_source])
92
+
93
+ for _, row in top_authors.iterrows():
94
+ authors = row['Authors'].split(', ')
95
+ citations = row[citation_source]
96
+
97
+ for author in authors:
98
+ author_citation_counts = author_citation_counts.append(
99
+ {'Author': author, 'Total Citations': citations}, ignore_index=True)
100
+
101
+ # Group by author and sum the citations
102
+ author_citation_totals = author_citation_counts.groupby(
103
+ 'Author')['Total Citations'].sum().reset_index()
104
+
105
+ # Display the results
106
+ st.write("Total Citations for Each Author:")
107
+ st.dataframe(author_citation_totals)
108
+
109
+ # Find the years in which papers received the most citations
110
+ st.write("\nYears in which papers received the most citations:")
111
+ years_with_most_citations = data.groupby(
112
+ 'Year')[citation_source].sum().reset_index()
113
+ years_with_most_citations = years_with_most_citations.sort_values(
114
+ by=citation_source, ascending=False).head(5)
115
+ st.write(years_with_most_citations)
116
+
117
+ if step_4:
118
+ # Add a sidebar to your Streamlit app
119
+ st.sidebar.title("Topic Modeling Settings")
120
+
121
+ # Create a number input for selecting the number of topics
122
+ num_topics = st.sidebar.number_input(
123
+ "Number of Topics", min_value=1, value=5, step=1)
124
+
125
+ # Display the selected number of topics
126
+ st.write(f"You have selected {num_topics} topics for topic modeling.")
127
+
128
+ # Create a list of titles for topic analysis
129
+ titles = data['Title'].tolist()
130
+
131
+ # Preprocess titles (tokenization, stop words removal, etc.)
132
+ preprocessed_titles = [preprocess_text(title) for title in titles]
133
+
134
+ # Create a dictionary and a corpus
135
+ dictionary = corpora.Dictionary(preprocessed_titles)
136
+ corpus = [dictionary.doc2bow(title_tokens) for title_tokens in preprocessed_titles]
137
+
138
+ # Build an LDA model
139
+ lda_model = LdaModel(corpus=corpus, id2word=dictionary,num_topics=num_topics, passes=15, alpha='auto')
140
+
141
+ # Visualizza i topic con PyLDAvis
142
+ vis_data = pyLDAvis.gensim_models.prepare(
143
+ lda_model, corpus, dictionary)
144
+ html_string = pyLDAvis.prepared_data_to_html(vis_data)
145
+
146
+ st.subheader("LDAVis Topic Visualization")
147
+ st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
148
+ st.components.v1.html(html_string, height=1100, width=1400)
149
+
150
+ # Group data by year and train an LDA model for each year
151
+ years = sorted(data['Year'].unique()) # Sort the years in ascending order
152
+ lda_models = {}
153
+
154
+ for year in years:
155
+ year_data = data[data['Year'] == year]
156
+ year_titles = year_data['Title'].tolist()
157
+ preprocessed_year_titles = [
158
+ preprocess_text(title) for title in year_titles]
159
+ year_corpus = [dictionary.doc2bow(
160
+ title_tokens) for title_tokens in preprocessed_year_titles]
161
+
162
+ # Train an LDA model for the current year
163
+ lda_model = train_lda_model(
164
+ year_corpus, dictionary, num_topics=num_topics, passes=15)
165
+ lda_models[year] = lda_model
166
+
167
+ # Crea un DataFrame con i dati aggregati dei topic per ogni anno
168
+ topic_data = {}
169
+ for year, lda_model in lda_models.items():
170
+ year_topics = lda_model.show_topics(num_words=5)
171
+ topics = [f"Topic {topic[0]}" for topic in year_topics]
172
+ topic_weights = [sum(float(w.split('*')[0])
173
+ for w in topic[1].split(" + ")) for topic in year_topics]
174
+ topic_data[year] = dict(zip(topics, topic_weights))
175
+
176
+ df = pd.DataFrame(topic_data).T
177
+
178
+ # Create a list of years for x-axis
179
+ years_list = [str(year) for year in df.index]
180
+ colorpalette = 'gnuplot'
181
+ # Generate a color palette using Seaborn
182
+ rgb_colors = sns.color_palette(colorpalette, num_topics)
183
+
184
+ # Convert RGB colors to RGB hex codes
185
+ topic_colors = list(map(mpl.colors.rgb2hex, rgb_colors))
186
+
187
+ # Crea un DataFrame con i dati aggregati dei topic per ogni anno
188
+ topic_data = {}
189
+ for year, lda_model in lda_models.items():
190
+ year_topics = lda_model.show_topics(num_words=5)
191
+ topics = [f"Topic {topic[0]}" for topic in year_topics]
192
+ topic_weights = [sum(float(w.split('*')[0])
193
+ for w in topic[1].split(" + ")) for topic in year_topics]
194
+ topic_data[year] = dict(zip(topics, topic_weights))
195
+
196
+ df = pd.DataFrame(topic_data).T
197
+
198
+ # Calcola la tendenza dei pesi dei topic nel tempo
199
+ trend_data = {}
200
+ for topic in topics:
201
+ topic_values = df[topic].values
202
+ trend = "Increasing" if topic_values[-1] > topic_values[0] else "Decreasing"
203
+ trend_data[topic] = trend
204
+
205
+ # Crea un DataFrame con le tendenze dei topic
206
+ # trend_df = pd.DataFrame(trend_data, index=['Trend']).T
207
+
208
+ # # Visualizza le tendenze dei topic
209
+ # st.write("Trends of Topics Over Years:")
210
+ # st.write(trend_df)
211
+
212
+ # Create a stacked bar chart
213
+ st.write("# Evolution of Topics Over Years")
214
 
215
+ # Create an interactive Plotly line chart for all topics with consistent colors
216
+ fig = go.Figure()
 
 
217
 
218
+ for i, topic in enumerate(topics):
219
+ fig.add_trace(go.Scatter(x=years_list, y=df[topic], mode='lines',
220
+ name=f"Topic {i+1}", line=dict(color=topic_colors[i])))
221
 
222
+ # Calcola la linea di tendenza (ad esempio, regressione lineare) per il topic
223
+ years_list_numeric = [int(year) for year in years_list]
224
 
225
+ z = np.polyfit(years_list_numeric, df[topic], 1)
226
+ p = np.poly1d(z)
227
+ fig.add_trace(go.Scatter(x=years_list_numeric, y=p(years_list_numeric),
228
+ mode='lines', name=f"Trend {i+1}: {trend_data[topic]}",
229
+ line=dict(color=topic_colors[i], dash='dash')))
230
 
231
+ # Imposta il layout del grafico
232
+ fig.update_layout(xaxis_title='Year', yaxis_title='Topic Weight',
233
+ title='Evolution of Topics Over Years', xaxis=dict(type='category'))
234
 
235
+ # Visualizza il grafico
236
+ st.plotly_chart(fig)
237
 
238
+ # Visualizza le Word Cloud per ciascun anno
239
+ for i, (year, lda_model) in enumerate(lda_models.items()):
240
+ # Crea e visualizza la Word Cloud
241
+ year_topics = lda_model.show_topics(num_words=5)
242
+ topics_words = [word for topic in year_topics for word in topic[1].split(" + ")]
243
+ wordcloud = WordCloud(width=800, height=400, background_color='white', colormap=colorpalette).generate(" ".join(topics_words))
244
+ st.subheader(f"Word Cloud for Year {year}")
245
+ st.image(wordcloud.to_image())
246
 
 
 
 
247
 
248
+ if __name__ == "__main__":
249
+ main()
 
 
 
app.py CHANGED
@@ -4,6 +4,17 @@ import knowledge_extraction
4
  import analysis
5
  import merge
6
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  page_names_to_funcs = {
9
  "Welcome": intro.intro,
@@ -12,5 +23,6 @@ page_names_to_funcs = {
12
  "Analysis": analysis.main,
13
  }
14
 
15
- demo_name = st.sidebar.selectbox("Choose a phase", page_names_to_funcs.keys())
 
16
  page_names_to_funcs[demo_name]()
 
4
  import analysis
5
  import merge
6
 
7
+ st.set_page_config(
8
+ page_title="IRCDL Conference",
9
+ page_icon="📖",
10
+ layout="wide",
11
+ initial_sidebar_state="expanded",
12
+ menu_items={
13
+ 'Get Help': 'https://ircdl2024.dei.unipd.it/',
14
+ 'Report a bug': "https://ircdl2024.dei.unipd.it/",
15
+ 'About': "# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research"
16
+ }
17
+ )
18
 
19
  page_names_to_funcs = {
20
  "Welcome": intro.intro,
 
23
  "Analysis": analysis.main,
24
  }
25
 
26
+ demo_name = st.sidebar.selectbox("IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research", page_names_to_funcs.keys())
27
+
28
  page_names_to_funcs[demo_name]()
intro.py CHANGED
@@ -1,7 +1,7 @@
1
- import streamlit
2
 
3
  def intro():
4
- import streamlit as st
5
 
6
  st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
7
  st.sidebar.success("Select a phase")
@@ -11,3 +11,5 @@ def intro():
11
  IRCDL site: https://ircdl2024.dei.unipd.it/
12
  """
13
  )
 
 
 
1
+ import streamlit as st
2
 
3
  def intro():
4
+ import streamlit as st
5
 
6
  st.write("# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research")
7
  st.sidebar.success("Select a phase")
 
11
  IRCDL site: https://ircdl2024.dei.unipd.it/
12
  """
13
  )
14
+
15
+