import streamlit as st
import xml.etree.ElementTree as ET
import csv


def knowledge_extraction():
    """Streamlit page that builds and enriches a DBLP bibliography CSV.

    Workflow (driven by sidebar checkboxes / a button):
      1. Optionally regenerate ``output/dblp.csv`` from ``output/dblp.xml``.
      2. Display the loaded CSV.
      3. Fill in missing DOIs by title lookup (local ``scholarly`` helper).
      4. Retrieve a citation count per row, save and offer the result
         as ``output/crawler_doi_citation.csv``.

    Side effects: reads/writes files under ``output/`` and renders UI.
    Returns ``None``.
    """
    # NOTE: duplicate `import streamlit as st` removed — already imported
    # at module level. pandas/scholarly stay function-local as before.
    import pandas as pd
    import scholarly  # NOTE(review): local helper module, not the PyPI
    # `scholarly` package — `get_doi_from_title` / `get_citation_count`
    # are assumed to exist there; confirm against the project.

    st.title("CSV Data Viewer")

    def _text_or_none(parent, path):
        """Return the text of the first element matching *path*, or the
        string 'None' (the CSV's missing-value convention) when the
        element is absent or empty.

        Fixes an AttributeError the old code raised for hits without a
        <title>, <year> or <url> element (only <doi> was guarded).
        """
        elem = parent.find(path)
        if elem is None or elem.text is None:
            return 'None'
        return elem.text

    def _generate_dblp_csv():
        """Convert output/dblp.xml into output/dblp.csv (';'-separated)."""
        tree = ET.parse('output/dblp.xml')
        root = tree.getroot()
        with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';')
            # CSV header
            csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
            for hit in root.findall('.//hit'):
                # `or ''` guards against empty <author/> elements whose
                # .text is None (the old generator crashed on those).
                authors = ', '.join(
                    author.text or ''
                    for author in hit.findall('.//authors/author')
                )
                title = _text_or_none(hit, './/title')
                year = _text_or_none(hit, './/year')
                doi = _text_or_none(hit, './/doi')
                url = _text_or_none(hit, './/url')
                # renamed from `id`, which shadowed the builtin
                record_id = hit.attrib['id']
                csvwriter.writerow([record_id, authors, title, year, doi, url])

    def _show_doi_coverage(frame):
        """Report how many rows still have the 'None' DOI placeholder."""
        count_none = (frame['DOI'] == 'None').sum()
        total_values = len(frame['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: "
                 + str(count_none) + "/" + str(total_values))

    def load_data():
        # latin1 matches how the crawler originally wrote the file.
        data = pd.read_csv("output/dblp.csv", sep=";", encoding="latin1")
        return data

    # Regenerate BEFORE loading: Streamlit reruns the whole script on a
    # button click, and the old order displayed stale, pre-regeneration
    # data (and crashed on first run when the CSV did not exist yet).
    if st.button("generate dblp csv"):
        _generate_dblp_csv()
        # Was `print(...)`, which only reached the server console;
        # surface the confirmation in the UI instead.
        st.success("Conversione XML in CSV completata.")

    data = load_data()
    _show_doi_coverage(data)

    # Step 1: Display the data retrieved from DBLP
    step_1 = st.sidebar.checkbox(
        "1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: Iterate over rows and update 'doi' column if necessary
    step_2 = st.sidebar.checkbox(
        "2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        # (Removed dead debug artifacts: unused `cit_array`, `count`,
        # and the stray `st.write(count)` that always printed 0.)
        for index, row in data.iterrows():
            # If 'doi' is the 'None' placeholder, try a title lookup.
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi
        st.write("Data with DOI")
        _show_doi_coverage(data)
        st.write(data)

    # Step 3: Loop over DOIs and retrieve citation counts
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []
        for index, row in data.iterrows():
            doi = row['DOI']
            # BUG FIX: the old test was `if doi:` — but missing DOIs are
            # stored as the *string* 'None', which is truthy, so the
            # title-fallback branch below was unreachable.
            if doi and doi != 'None':
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # Missing DOI: resolve one from the title first.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count = scholarly.get_citation_count(doi_bytitle)
                if citation_count is not None:
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")
            # BUG FIX: always append (None when no count was found) so
            # the list stays row-aligned; the old code skipped rows and
            # the column assignment below raised a length-mismatch
            # ValueError whenever any lookup failed.
            cit_array.append(citation_count)

        # Add the citation count column to the DataFrame
        data['Citation Count'] = cit_array
        st.write(data)
        output_path = "output/crawler_doi_citation.csv"
        data.to_csv(output_path, sep=';', index=False)

    # Step 4: Download Filtered Data as CSV
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="crawler_doi_citation.csv",
            key="crawler_doi_citation",
        )