Spaces:
Runtime error
Runtime error
Eleonora Bernasconi committed on
Commit ·
716b2bd
1
Parent(s): 438c3c8
update dblp code
Browse files
- __pycache__/knowledge_extraction.cpython-37.pyc +0 -0
- __pycache__/merge.cpython-37.pyc +0 -0
- knowledge_extraction.py +14 -8
- merge.py +3 -28
- output.csv +0 -0
- output/crawler_doi_citation.csv +0 -0
- output/output_crawled_data.csv +0 -0
- output/scholar_dblp_semantics.csv +0 -0
- outputMerged.csv +0 -0
__pycache__/knowledge_extraction.cpython-37.pyc
CHANGED
|
Binary files a/__pycache__/knowledge_extraction.cpython-37.pyc and b/__pycache__/knowledge_extraction.cpython-37.pyc differ
|
|
|
__pycache__/merge.cpython-37.pyc
CHANGED
|
Binary files a/__pycache__/merge.cpython-37.pyc and b/__pycache__/merge.cpython-37.pyc differ
|
|
|
knowledge_extraction.py
CHANGED
|
@@ -60,20 +60,24 @@ def knowledge_extraction():
|
|
| 60 |
|
| 61 |
# Iterate over rows and update 'doi' column if necessary
|
| 62 |
for index, row in data.iterrows():
|
| 63 |
-
doi = row['
|
| 64 |
-
title = row['
|
| 65 |
|
| 66 |
# If 'doi' is None, attempt to get DOI from title
|
| 67 |
-
if
|
| 68 |
doi = scholarly.get_doi_from_title(title)
|
| 69 |
# Update the DataFrame with the retrieved DOI
|
| 70 |
if doi:
|
| 71 |
-
data.at[index, '
|
| 72 |
|
| 73 |
# Display the updated data table
|
| 74 |
st.write("Data with DOI")
|
|
|
|
|
|
|
|
|
|
| 75 |
st.write(data)
|
| 76 |
|
|
|
|
| 77 |
# Step 3: Loop over DOIs and retrieve citation counts
|
| 78 |
step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
|
| 79 |
if step_3:
|
|
@@ -82,7 +86,7 @@ def knowledge_extraction():
|
|
| 82 |
|
| 83 |
# Loop over DOIs and retrieve citation counts
|
| 84 |
for index, row in data.iterrows():
|
| 85 |
-
doi = row['
|
| 86 |
if doi:
|
| 87 |
citation_count = scholarly.get_citation_count(doi)
|
| 88 |
if citation_count is not None:
|
|
@@ -91,7 +95,7 @@ def knowledge_extraction():
|
|
| 91 |
count += 1
|
| 92 |
else:
|
| 93 |
# Handle cases where DOI is None (e.g., bytitle lookup)
|
| 94 |
-
title = row['
|
| 95 |
doi_bytitle = scholarly.get_doi_from_title(str(title))
|
| 96 |
citation_count_title = scholarly.get_citation_count(doi_bytitle)
|
| 97 |
if citation_count_title is not None:
|
|
@@ -102,12 +106,14 @@ def knowledge_extraction():
|
|
| 102 |
# Add the citation count column to the DataFrame
|
| 103 |
data['Citation Count'] = cit_array
|
| 104 |
st.write(data)
|
|
|
|
|
|
|
| 105 |
|
| 106 |
# Step 4: Download Filtered Data as CSV
|
| 107 |
if not data.empty and step_3:
|
| 108 |
st.download_button(
|
| 109 |
label="Download Filtered Data as CSV",
|
| 110 |
data=data.to_csv(index=False).encode(),
|
| 111 |
-
file_name="
|
| 112 |
-
key="
|
| 113 |
)
|
|
|
|
| 60 |
|
| 61 |
# Iterate over rows and update 'doi' column if necessary
|
| 62 |
for index, row in data.iterrows():
|
| 63 |
+
doi = row['DOI']
|
| 64 |
+
title = row['Title']
|
| 65 |
|
| 66 |
# If 'doi' is None, attempt to get DOI from title
|
| 67 |
+
if doi == 'None':
|
| 68 |
doi = scholarly.get_doi_from_title(title)
|
| 69 |
# Update the DataFrame with the retrieved DOI
|
| 70 |
if doi:
|
| 71 |
+
data.at[index, 'DOI'] = doi
|
| 72 |
|
| 73 |
# Display the updated data table
|
| 74 |
st.write("Data with DOI")
|
| 75 |
+
count_none = (data['DOI'] == 'None').sum()
|
| 76 |
+
total_values = len(data['DOI'])
|
| 77 |
+
st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
|
| 78 |
st.write(data)
|
| 79 |
|
| 80 |
+
|
| 81 |
# Step 3: Loop over DOIs and retrieve citation counts
|
| 82 |
step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
|
| 83 |
if step_3:
|
|
|
|
| 86 |
|
| 87 |
# Loop over DOIs and retrieve citation counts
|
| 88 |
for index, row in data.iterrows():
|
| 89 |
+
doi = row['DOI']
|
| 90 |
if doi:
|
| 91 |
citation_count = scholarly.get_citation_count(doi)
|
| 92 |
if citation_count is not None:
|
|
|
|
| 95 |
count += 1
|
| 96 |
else:
|
| 97 |
# Handle cases where DOI is None (e.g., bytitle lookup)
|
| 98 |
+
title = row['Title']
|
| 99 |
doi_bytitle = scholarly.get_doi_from_title(str(title))
|
| 100 |
citation_count_title = scholarly.get_citation_count(doi_bytitle)
|
| 101 |
if citation_count_title is not None:
|
|
|
|
| 106 |
# Add the citation count column to the DataFrame
|
| 107 |
data['Citation Count'] = cit_array
|
| 108 |
st.write(data)
|
| 109 |
+
output_path = "output/crawler_doi_citation.csv"
|
| 110 |
+
data.to_csv(output_path, sep=';', index=False)
|
| 111 |
|
| 112 |
# Step 4: Download Filtered Data as CSV
|
| 113 |
if not data.empty and step_3:
|
| 114 |
st.download_button(
|
| 115 |
label="Download Filtered Data as CSV",
|
| 116 |
data=data.to_csv(index=False).encode(),
|
| 117 |
+
file_name="crawler_doi_citation.csv",
|
| 118 |
+
key="crawler_doi_citation",
|
| 119 |
)
|
merge.py
CHANGED
|
@@ -36,10 +36,12 @@ def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
|
|
| 36 |
# Aggiungi solo le colonne 'doi' e 'citazioni' da scholar_matches
|
| 37 |
merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
|
| 38 |
merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
|
|
|
|
| 39 |
if not semantics_matches.empty:
|
| 40 |
# Aggiungi solo le colonne 'doi' e 'citazioni' da semantics_matches
|
| 41 |
merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
|
| 42 |
merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
|
|
|
|
| 43 |
|
| 44 |
merged_records.append(merged_record)
|
| 45 |
# Scrivi il record nel file CSV
|
|
@@ -60,39 +62,12 @@ def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
|
|
| 60 |
def main():
|
| 61 |
# Interfaccia utente Streamlit
|
| 62 |
st.title("Knowledge enrichment")
|
| 63 |
-
|
| 64 |
-
if st.button("generate dblp csv"):
|
| 65 |
-
# Carica il file XML
|
| 66 |
-
tree = ET.parse('output/dblp.xml')
|
| 67 |
-
root = tree.getroot()
|
| 68 |
-
# Crea un file CSV per scrivere i dati
|
| 69 |
-
with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
|
| 70 |
-
csvwriter = csv.writer(csvfile, delimiter=';')
|
| 71 |
-
|
| 72 |
-
# Scrivi l'intestazione del CSV
|
| 73 |
-
csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
|
| 74 |
-
|
| 75 |
-
# Estrai i dati XML e scrivili nel CSV
|
| 76 |
-
for hit in root.findall('.//hit'):
|
| 77 |
-
authors = ', '.join(author.text for author in hit.findall('.//authors/author'))
|
| 78 |
-
title = hit.find('.//title').text
|
| 79 |
-
year = hit.find('.//year').text
|
| 80 |
-
doi_element = hit.find('.//doi')
|
| 81 |
-
doi = doi_element.text if doi_element is not None else 'None'
|
| 82 |
-
url = hit.find('.//url').text
|
| 83 |
-
id = hit.attrib['id']
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
# Scrivi i dati nel file CSV
|
| 87 |
-
csvwriter.writerow([id, authors, title, year, doi, url])
|
| 88 |
-
|
| 89 |
-
print("Conversione XML in CSV completata.")
|
| 90 |
|
| 91 |
if st.button("create enriched dataset"):
|
| 92 |
# Carica i file CSV in DataFrame
|
| 93 |
scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
|
| 94 |
semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
|
| 95 |
-
dblp_df = pd.read_csv("output/
|
| 96 |
|
| 97 |
# Call the merge_and_enrich_datasets function
|
| 98 |
enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
|
|
|
|
| 36 |
# Aggiungi solo le colonne 'doi' e 'citazioni' da scholar_matches
|
| 37 |
merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
|
| 38 |
merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
|
| 39 |
+
merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
|
| 40 |
if not semantics_matches.empty:
|
| 41 |
# Aggiungi solo le colonne 'doi' e 'citazioni' da semantics_matches
|
| 42 |
merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
|
| 43 |
merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
|
| 44 |
+
merged_record['abstract_scholar'] = 'None'
|
| 45 |
|
| 46 |
merged_records.append(merged_record)
|
| 47 |
# Scrivi il record nel file CSV
|
|
|
|
| 62 |
def main():
|
| 63 |
# Interfaccia utente Streamlit
|
| 64 |
st.title("Knowledge enrichment")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
if st.button("create enriched dataset"):
|
| 67 |
# Carica i file CSV in DataFrame
|
| 68 |
scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
|
| 69 |
semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
|
| 70 |
+
dblp_df = pd.read_csv("output/crawler_doi_citation.csv", sep=";", encoding='utf-8')
|
| 71 |
|
| 72 |
# Call the merge_and_enrich_datasets function
|
| 73 |
enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
|
output.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
output/crawler_doi_citation.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
output/output_crawled_data.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
output/scholar_dblp_semantics.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputMerged.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|