# NOTE: the three lines originally here ("Spaces:", "Runtime error", "Runtime error")
# were a copy/paste artifact from the hosting page's status banner, not source code.
# Standard library
import csv
import xml.etree.ElementTree as ET

# Third-party
import streamlit as st
def dblp_xml_to_csv(xml_path, csv_path):
    """Convert a DBLP search-API XML dump into a semicolon-separated CSV.

    Columns written: DBLP (the hit's ``id`` attribute), Authors, Title,
    Year, DOI, URL.  Any missing element is written as the literal string
    'None' so the downstream steps can detect and repair it (the original
    code only guarded the DOI field and crashed on a missing title/year/url).
    """

    def _text(node, path):
        # Element text, or 'None' when the element is absent or empty.
        el = node.find(path)
        return el.text if el is not None and el.text is not None else 'None'

    root = ET.parse(xml_path).getroot()
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
        for hit in root.findall('.//hit'):
            # Skip empty <author/> entries instead of crashing on None text.
            authors = ', '.join(
                a.text for a in hit.findall('.//authors/author') if a.text
            )
            writer.writerow([
                hit.attrib.get('id', 'None'),  # avoid shadowing builtin id()
                authors,
                _text(hit, './/title'),
                _text(hit, './/year'),
                _text(hit, './/doi'),
                _text(hit, './/url'),
            ])


def knowledge_extraction():
    """Streamlit page: build the DBLP CSV, repair missing DOIs, and enrich
    rows with citation counts.

    Relies on the project-local ``scholarly`` module for
    ``get_doi_from_title`` / ``get_citation_count`` -
    NOTE(review): this is not the PyPI ``scholarly`` API; confirm the
    helper module's contract.
    """
    import streamlit as st
    import pandas as pd
    import scholarly

    st.title("CSV Data Viewer")
    csv_path = "output/dblp.csv"

    if st.button("generate dblp csv"):
        dblp_xml_to_csv('output/dblp.xml', csv_path)
        st.success("Conversione XML in CSV completata.")

    # Load AFTER the optional regeneration so freshly generated data is shown
    # (the original loaded first, showing stale data and crashing with
    # FileNotFoundError before the CSV was ever generated).
    try:
        try:
            # The generator writes UTF-8; the original read latin1, mangling
            # accented author names.  Fall back to latin1 for legacy files.
            data = pd.read_csv(csv_path, sep=";", encoding="utf-8")
        except UnicodeDecodeError:
            data = pd.read_csv(csv_path, sep=";", encoding="latin1")
    except FileNotFoundError:
        st.warning("output/dblp.csv not found - press 'generate dblp csv' first.")
        return

    def show_doi_stats():
        # How many rows still lack a DOI (stored as the string 'None').
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))

    show_doi_stats()

    # Step 1: Display the data retrieved from DBLP
    if st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)"):
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: fill in missing DOIs by looking the title up.
    if st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary"):
        for index, row in data.iterrows():
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi
        st.write("Data with DOI")
        show_doi_stats()
        st.write(data)

    # Step 3: Loop over DOIs and retrieve citation counts
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []
        for index, row in data.iterrows():
            doi = row['DOI']
            citation_count = None
            # The string 'None' is truthy, so test for it explicitly -
            # the original `if doi:` sent the literal 'None' to the
            # citation API and never reached the by-title fallback.
            if isinstance(doi, str) and doi != 'None':
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # DOI still unknown: look it up by title first.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count = scholarly.get_citation_count(doi_bytitle)
                if citation_count is not None:
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")
            # Always append (None included) so the column length matches the
            # frame - the original skipped failed lookups and could raise a
            # length-mismatch ValueError on the assignment below.
            cit_array.append(citation_count)
        # Add the citation count column to the DataFrame
        data['Citation Count'] = cit_array
        st.write(data)
        data.to_csv("output/crawler_doi_citation.csv", sep=';', index=False)

    # Step 4: Download Filtered Data as CSV
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="crawler_doi_citation.csv",
            key="crawler_doi_citation",
        )