# ircdl_data/knowledge_extraction.py
# Author: Eleonora Bernasconi
# Commit: update dblp code (716b2bd)
import streamlit as st
import xml.etree.ElementTree as ET
import csv
def knowledge_extraction():
    """Render the Streamlit "CSV Data Viewer" page for DBLP bibliographic data.

    Workflow (each step toggled from the sidebar):
      1. Optionally regenerate ``output/dblp.csv`` from ``output/dblp.xml``.
      2. Display the raw DBLP table.
      3. Fill in missing DOIs ('None' placeholders) via a title lookup.
      4. Fetch per-row citation counts, save the enriched CSV, and offer it
         as a download.

    Side effects: reads/writes files under ``output/`` and renders Streamlit
    widgets. Returns nothing.
    """
    import pandas as pd
    # NOTE(review): this code calls scholarly.get_doi_from_title() and
    # scholarly.get_citation_count(), which the public `scholarly` package
    # does not provide — presumably a project-local wrapper. Confirm.
    import scholarly

    st.title("CSV Data Viewer")

    def load_data():
        # NOTE(review): the CSV is written below as UTF-8 but read back as
        # latin1, which mangles non-ASCII authors/titles — confirm intent.
        return pd.read_csv("output/dblp.csv", sep=";", encoding="latin1")

    def element_text(hit, tag):
        # Missing optional tags become the string 'None' (the sentinel the
        # rest of this page compares against) instead of raising
        # AttributeError as the original title/year/url lookups did.
        element = hit.find(tag)
        return element.text if element is not None else 'None'

    def generate_csv_from_xml():
        # Convert output/dblp.xml <hit> records into a ';'-separated CSV.
        tree = ET.parse('output/dblp.xml')
        root = tree.getroot()
        with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';')
            csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
            for hit in root.findall('.//hit'):
                authors = ', '.join(
                    author.text or '' for author in hit.findall('.//authors/author')
                )
                # `hit_id` renamed from `id` (shadowed the builtin).
                hit_id = hit.attrib['id']
                csvwriter.writerow([
                    hit_id,
                    authors,
                    element_text(hit, './/title'),
                    element_text(hit, './/year'),
                    element_text(hit, './/doi'),
                    element_text(hit, './/url'),
                ])
        print("Conversione XML in CSV completata.")

    def show_doi_coverage(frame):
        # Report how many DOIs are still the 'None' placeholder.
        count_none = (frame['DOI'] == 'None').sum()
        total_values = len(frame['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: "
                 + str(count_none) + "/" + str(total_values))

    data = load_data()

    if st.button("generate dblp csv"):
        generate_csv_from_xml()
        # BUGFIX: reload so the coverage stats below reflect the file we
        # just wrote, not the one loaded before the button was pressed.
        data = load_data()

    show_doi_coverage(data)

    # Step 1: Display the data retrieved from DBLP.
    step_1 = st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: Fill in missing DOIs by looking them up from the title.
    step_2 = st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        for index, row in data.iterrows():
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi
        st.write("Data with DOI")
        show_doi_coverage(data)
        st.write(data)

    # Step 3: Retrieve a citation count for every row.
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        citation_counts = []
        count = 0
        for index, row in data.iterrows():
            doi = row['DOI']
            # BUGFIX: missing DOIs are stored as the *string* 'None', which
            # is truthy — the original `if doi:` made the title-lookup
            # fallback unreachable.
            if doi and doi != 'None':
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    count += 1
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
            else:
                # Fallback: resolve a DOI from the title, then count.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count = scholarly.get_citation_count(doi_bytitle)
                if citation_count is not None:
                    count += 1
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count}")
            # BUGFIX: always append one value per row (None on failed
            # lookups). The original only appended on success, so the
            # column assignment below raised ValueError on any miss.
            citation_counts.append(citation_count)

        data['Citation Count'] = citation_counts
        st.write(data)
        data.to_csv("output/crawler_doi_citation.csv", sep=';', index=False)

    # Step 4: Offer the enriched table as a CSV download.
    if not data.empty and step_3:
        st.download_button(
            label="Download Filtered Data as CSV",
            data=data.to_csv(index=False).encode(),
            file_name="crawler_doi_citation.csv",
            key="crawler_doi_citation",
        )