Spaces:

eleeleai
/

ircdl_data

Runtime error

App Files Files Community

Eleonora Bernasconi committed on Sep 25, 2023

Commit

ba3cdcb

1 Parent(s): a1a8bdc

update

Browse files

Files changed (1) hide show

app.py +138 -58

app.py CHANGED Viewed

@@ -1,66 +1,146 @@
 import streamlit as st
-import pandas as pd
-import scholarly
-st.title("CSV Data Viewer")
-def load_data():
-    data = pd.read_csv("data.csv", sep=";", usecols=range(10), encoding="latin1")
-    return data
-data = load_data()
-# Display the data
-st.write("Data from CSV:")
-st.write(data)
-cit_array = []
-count = 0
-st.write(count)
-# Iterate over rows and update 'doi' column if necessary
-for index, row in data.iterrows():
-    doi = row['doi']
-    title = row['title']
-    # If 'doi' is None, attempt to get DOI from title
-    if pd.isnull(doi):
-        doi = scholarly.get_doi_from_title(title)
-        # Update the DataFrame with the retrieved DOI
-        if doi:
-            data.at[index, 'doi'] = doi
-# Display the updated data table
-st.write("Data with DOI")
-st.write(data)
-# Loop over DOIs and retrieve citation counts
-for index, row in data.iterrows():
-    doi = row['doi']
-    if doi:
-        citation_count = scholarly.get_citation_count(doi)
-        if citation_count != None:
-            cit_array.append(citation_count)
-            st.text(f"DOI: {doi}, Citation Count: {citation_count}")
-            count += 1
-        else:
-            # Handle cases where DOI is None (e.g., bytitle lookup)
-            title = row['title']
-            doi_bytitle = scholarly.get_doi_from_title(str(title))
-            citation_count_title = scholarly.get_citation_count(doi_bytitle)
-            if citation_count_title != None:
-                count += 1
-                st.text(f"DOI from Title: {title}, Citation Count: {citation_count_title}")
-            cit_array.append(citation_count_title)
-# Add the citation count column to the DataFrame
-data['Citation Count'] = cit_array
-st.write(data)
-if not data.empty:
-    st.download_button(
-        label="Download Filtered Data as CSV",
-        data=data.to_csv(index=False).encode(),
-        file_name="filtered_data.csv",
-        key="download_filtered_data",
-    )

 import streamlit as st
+def knowledge_extraction():
+    """Render the "Knowledge Extraction" page.
+
+    Loads ``data.csv`` and exposes three sidebar checkboxes. When ticked they
+    (1) show the loaded table, (2) fill in missing ``doi`` values through a
+    title lookup, and (3) fetch one citation count per row and store it in a
+    ``Citation Count`` column. While step 3 is active and the table is not
+    empty, a CSV download button is offered.
+    """
+    import streamlit as st
+    import pandas as pd
+    # NOTE(review): get_doi_from_title / get_citation_count are expected to be
+    # provided by whatever module this import resolves to -- confirm they
+    # exist at runtime.
+    import scholarly
+    st.title("CSV Data Viewer")
+    def load_data():
+        """Read the first ten ';'-separated columns of data.csv."""
+        data = pd.read_csv("data.csv", sep=";", usecols=range(10), encoding="latin1")
+        return data
+    data = load_data()
+    # Step 1: Display the data retrieved from DBLP
+    step_1 = st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
+    if step_1:
+        st.write("Data from DBLP:")
+        st.write(data)
+    # Step 2: Iterate over rows and update 'doi' column if necessary
+    step_2 = st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary")
+    if step_2:
+        for index, row in data.iterrows():
+            # Only rows without a DOI need the title-based lookup.
+            if pd.isnull(row['doi']):
+                doi = scholarly.get_doi_from_title(row['title'])
+                # Keep the cell empty when the lookup finds nothing.
+                if doi:
+                    data.at[index, 'doi'] = doi
+        # Display the updated data table
+        st.write("Data with DOI")
+        st.write(data)
+    # Step 3: Loop over DOIs and retrieve citation counts
+    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
+    if step_3:
+        cit_array = []
+        for _, row in data.iterrows():
+            doi = row['doi']
+            citation_count = None
+            # pandas represents a missing cell as NaN, which is truthy, so
+            # test for null explicitly before querying by DOI.
+            if pd.notnull(doi) and doi:
+                citation_count = scholarly.get_citation_count(doi)
+            if citation_count is not None:
+                st.text(f"DOI: {doi}, Citation Count: {citation_count}")
+            else:
+                # No usable DOI, or no count for it: retry through the title.
+                title = row['title']
+                doi_bytitle = scholarly.get_doi_from_title(str(title))
+                citation_count = scholarly.get_citation_count(doi_bytitle)
+                if citation_count is not None:
+                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")
+            # Exactly one entry per row, so the list always matches the
+            # frame length when it is assigned as a column below.
+            cit_array.append(citation_count)
+        # Add the citation count column to the DataFrame
+        data['Citation Count'] = cit_array
+        st.write(data)
+    # Step 4: Download Filtered Data as CSV
+    if step_3 and not data.empty:
+        st.download_button(
+            label="Download Filtered Data as CSV",
+            data=data.to_csv(index=False).encode(),
+            file_name="filtered_data.csv",
+            key="download_filtered_data",
+        )
+def analysis():
+    """Render the "Analysis" page from the crawled citation data.
+
+    Reads ``output/output_crawled_data.csv``, optionally displays it, and
+    writes three summaries: the author entries tied for the highest total
+    citation count, the ten most cited papers, and the five years with the
+    highest total citation count. If the file has no rows, a warning is shown
+    instead of the summaries.
+    """
+    import streamlit as st
+    import pandas as pd
+    st.write("# Analysis of knowledge")
+    def load_data():
+        """Read the crawled output CSV (comma separated, latin1)."""
+        data = pd.read_csv("output/output_crawled_data.csv", sep=",", encoding="latin1")
+        return data
+    data = load_data()
+    # Step 1: Display the crawled data
+    step_1 = st.sidebar.checkbox("1 - Display the crawled data")
+    if step_1:
+        st.write("Crawed data:")
+        st.write(data)
+    # Nothing to rank without rows; stop before the aggregations below.
+    if data.empty:
+        st.warning("No crawled data available for analysis.")
+        return
+    # Group the rows by author entry and sum the citations of each group
+    authors_citation_counts = data.groupby('authors')['Citation Count'].sum().reset_index()
+    # Order the author entries by total citations, highest first
+    top_authors = authors_citation_counts.sort_values(by='Citation Count', ascending=False)
+    # Select every author entry that reaches the maximum total; .max() is
+    # used instead of .iloc[0] so an empty grouping does not raise
+    max_citations = top_authors['Citation Count'].max()
+    top_citation_authors = top_authors[top_authors['Citation Count'] == max_citations]['authors']
+    # The ten most cited papers
+    top_papers = data.sort_values(by='Citation Count', ascending=False).head(10)
+    # The five years whose papers sum to the most citations
+    years_with_most_citations = data.groupby('year')['Citation Count'].sum().reset_index()
+    years_with_most_citations = years_with_most_citations.sort_values(by='Citation Count', ascending=False).head(5)
+    # Show the results
+    st.write("Lista di autori con il massimo numero di citazioni:")
+    st.write(top_citation_authors.to_string(index=False))
+    st.write("\nLista dei paper più citati:")
+    st.write(top_papers[['title', 'Citation Count']].to_string(index=False))
+    st.write("\nAnni in cui i paper hanno ottenuto più citazioni:")
+    st.write(years_with_most_citations.to_string(index=False))
+def intro():
+    """Render the welcome page: title, sidebar hint, and conference link."""
+    import streamlit as st
+    page_heading = "# IRCDL Conference: A Two-Decade Bibliometric Journey Through Digital Libraries Research"
+    # The literal keeps its original line breaks and indentation.
+    site_note = """
+        IRCDL site: https://ircdl2024.dei.unipd.it/
+        """
+    st.write(page_heading)
+    st.sidebar.success("Select a phase")
+    st.markdown(site_note)
+# Map each sidebar entry to the function that renders that page.
+page_names_to_funcs = {
+    "Welcome": intro,
+    "Knowledge Extraction": knowledge_extraction,
+    "Analysis": analysis,
+}
+# Offer the page names in the sidebar, then render the chosen page.
+demo_name = st.sidebar.selectbox("Choose a phase", list(page_names_to_funcs))
+render_selected_page = page_names_to_funcs[demo_name]
+render_selected_page()