# Streamlit page: DBLP CSV data viewer / knowledge extraction.
import streamlit as st
import xml.etree.ElementTree as ET
import csv
def knowledge_extraction():
    """Streamlit page: view DBLP data, resolve missing DOIs, and fetch citation counts.

    Side effects: reads/writes files under ``output/`` (dblp.xml, dblp.csv,
    crawler_doi_citation.csv) and renders Streamlit widgets. Uses the
    project-local ``scholarly`` helper module for DOI/citation lookups.
    """
    import streamlit as st
    import pandas as pd
    import scholarly  # project-local helper (get_doi_from_title / get_citation_count)

    st.title("CSV Data Viewer")

    def load_data():
        # The generated file is semicolon-separated; latin1 tolerates stray bytes.
        return pd.read_csv("output/dblp.csv", sep=";", encoding="latin1")

    data = load_data()

    if st.button("generate dblp csv"):
        # Parse the crawled XML file.
        tree = ET.parse('output/dblp.xml')
        root = tree.getroot()
        # Write the extracted records out as a semicolon-separated CSV.
        with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';')
            csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
            for hit in root.findall('.//hit'):
                authors = ', '.join(
                    author.text for author in hit.findall('.//authors/author')
                )
                # Guard every lookup: a missing element would otherwise raise
                # AttributeError (the original only guarded the DOI).
                title_el = hit.find('.//title')
                title = title_el.text if title_el is not None else 'None'
                year_el = hit.find('.//year')
                year = year_el.text if year_el is not None else 'None'
                doi_el = hit.find('.//doi')
                doi = doi_el.text if doi_el is not None else 'None'
                url_el = hit.find('.//url')
                url = url_el.text if url_el is not None else 'None'
                hit_id = hit.attrib.get('id', '')  # renamed: don't shadow builtin `id`
                csvwriter.writerow([hit_id, authors, title, year, doi, url])
        print("Conversione XML in CSV completata.")
        # Reload so the statistics below reflect the freshly generated file,
        # not the copy loaded before the button was pressed.
        data = load_data()

    # Overall DOI coverage statistics (missing DOIs are stored as the string 'None').
    count_none = (data['DOI'] == 'None').sum()
    total_values = len(data['DOI'])
    st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))

    # Step 1: Display the data retrieved from DBLP.
    step_1 = st.sidebar.checkbox("1 - Display the data retrieved from DBLP (Digital Bibliography & Library Project)")
    if step_1:
        st.write("Data from DBLP:")
        st.write(data)

    # Step 2: Iterate over rows and fill in missing DOIs via title lookup.
    step_2 = st.sidebar.checkbox("2 - Iterate over rows and update 'doi' column if necessary")
    if step_2:
        for index, row in data.iterrows():
            # Only rows whose DOI is the placeholder string need a lookup.
            if row['DOI'] == 'None':
                doi = scholarly.get_doi_from_title(row['Title'])
                if doi:
                    data.at[index, 'DOI'] = doi
        # Display the updated table and refreshed coverage statistics.
        st.write("Data with DOI")
        count_none = (data['DOI'] == 'None').sum()
        total_values = len(data['DOI'])
        st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
        st.write(data)

    # Step 3: Loop over DOIs and retrieve citation counts.
    step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
    if step_3:
        cit_array = []
        count = 0  # number of rows for which a citation count was found
        for index, row in data.iterrows():
            doi = row['DOI']
            # NOTE: the original tested `if doi:` — always true for the string
            # 'None', which made the title-fallback branch unreachable.
            if doi and doi != 'None':
                citation_count = scholarly.get_citation_count(doi)
                if citation_count is not None:
                    count += 1
                    st.text(f"DOI: {doi}, Citation Count: {citation_count}")
                # Append one entry per row (None included) so the column length
                # always matches the frame; the original skipped None results,
                # which made the assignment below raise ValueError.
                cit_array.append(citation_count)
            else:
                # DOI still missing: resolve one from the title first.
                title = row['Title']
                doi_bytitle = scholarly.get_doi_from_title(str(title))
                citation_count_title = scholarly.get_citation_count(doi_bytitle)
                if citation_count_title is not None:
                    count += 1
                    st.text(f"DOI from Title: {title}, Citation Count: {citation_count_title}")
                cit_array.append(citation_count_title)
        # Add the citation count column and persist the enriched table.
        data['Citation Count'] = cit_array
        st.write(data)
        output_path = "output/crawler_doi_citation.csv"
        data.to_csv(output_path, sep=';', index=False)

        # Step 4: offer the enriched table as a download (only meaningful after step 3).
        if not data.empty:
            st.download_button(
                label="Download Filtered Data as CSV",
                data=data.to_csv(index=False).encode(),
                file_name="crawler_doi_citation.csv",
                key="crawler_doi_citation",
                mime="text/csv",
            )