Eleonora Bernasconi committed on
Commit
716b2bd
·
1 Parent(s): 438c3c8

update dblp code

Browse files
__pycache__/knowledge_extraction.cpython-37.pyc CHANGED
Binary files a/__pycache__/knowledge_extraction.cpython-37.pyc and b/__pycache__/knowledge_extraction.cpython-37.pyc differ
 
__pycache__/merge.cpython-37.pyc CHANGED
Binary files a/__pycache__/merge.cpython-37.pyc and b/__pycache__/merge.cpython-37.pyc differ
 
knowledge_extraction.py CHANGED
@@ -60,20 +60,24 @@ def knowledge_extraction():
60
 
61
  # Iterate over rows and update 'doi' column if necessary
62
  for index, row in data.iterrows():
63
- doi = row['doi']
64
- title = row['title']
65
 
66
  # If 'doi' is None, attempt to get DOI from title
67
- if pd.isnull(doi):
68
  doi = scholarly.get_doi_from_title(title)
69
  # Update the DataFrame with the retrieved DOI
70
  if doi:
71
- data.at[index, 'doi'] = doi
72
 
73
  # Display the updated data table
74
  st.write("Data with DOI")
 
 
 
75
  st.write(data)
76
 
 
77
  # Step 3: Loop over DOIs and retrieve citation counts
78
  step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
79
  if step_3:
@@ -82,7 +86,7 @@ def knowledge_extraction():
82
 
83
  # Loop over DOIs and retrieve citation counts
84
  for index, row in data.iterrows():
85
- doi = row['doi']
86
  if doi:
87
  citation_count = scholarly.get_citation_count(doi)
88
  if citation_count is not None:
@@ -91,7 +95,7 @@ def knowledge_extraction():
91
  count += 1
92
  else:
93
  # Handle cases where DOI is None (e.g., bytitle lookup)
94
- title = row['title']
95
  doi_bytitle = scholarly.get_doi_from_title(str(title))
96
  citation_count_title = scholarly.get_citation_count(doi_bytitle)
97
  if citation_count_title is not None:
@@ -102,12 +106,14 @@ def knowledge_extraction():
102
  # Add the citation count column to the DataFrame
103
  data['Citation Count'] = cit_array
104
  st.write(data)
 
 
105
 
106
  # Step 4: Download Filtered Data as CSV
107
  if not data.empty and step_3:
108
  st.download_button(
109
  label="Download Filtered Data as CSV",
110
  data=data.to_csv(index=False).encode(),
111
- file_name="filtered_data.csv",
112
- key="download_filtered_data",
113
  )
 
60
 
61
  # Iterate over rows and update 'doi' column if necessary
62
  for index, row in data.iterrows():
63
+ doi = row['DOI']
64
+ title = row['Title']
65
 
66
  # If 'doi' is None, attempt to get DOI from title
67
+ if doi == 'None':
68
  doi = scholarly.get_doi_from_title(title)
69
  # Update the DataFrame with the retrieved DOI
70
  if doi:
71
+ data.at[index, 'DOI'] = doi
72
 
73
  # Display the updated data table
74
  st.write("Data with DOI")
75
+ count_none = (data['DOI'] == 'None').sum()
76
+ total_values = len(data['DOI'])
77
+ st.write("Number of values in DOI column equal to None / Total values: " + str(count_none) + "/" + str(total_values))
78
  st.write(data)
79
 
80
+
81
  # Step 3: Loop over DOIs and retrieve citation counts
82
  step_3 = st.sidebar.checkbox("3 - Loop over DOIs and retrieve citation counts")
83
  if step_3:
 
86
 
87
  # Loop over DOIs and retrieve citation counts
88
  for index, row in data.iterrows():
89
+ doi = row['DOI']
90
  if doi:
91
  citation_count = scholarly.get_citation_count(doi)
92
  if citation_count is not None:
 
95
  count += 1
96
  else:
97
  # Handle cases where DOI is None (e.g., bytitle lookup)
98
+ title = row['Title']
99
  doi_bytitle = scholarly.get_doi_from_title(str(title))
100
  citation_count_title = scholarly.get_citation_count(doi_bytitle)
101
  if citation_count_title is not None:
 
106
  # Add the citation count column to the DataFrame
107
  data['Citation Count'] = cit_array
108
  st.write(data)
109
+ output_path = "output/crawler_doi_citation.csv"
110
+ data.to_csv(output_path, sep=';', index=False)
111
 
112
  # Step 4: Download Filtered Data as CSV
113
  if not data.empty and step_3:
114
  st.download_button(
115
  label="Download Filtered Data as CSV",
116
  data=data.to_csv(index=False).encode(),
117
+ file_name="crawler_doi_citation.csv",
118
+ key="crawler_doi_citation",
119
  )
merge.py CHANGED
@@ -36,10 +36,12 @@ def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
36
  # Aggiungi solo le colonne 'doi' e 'citazioni' da scholar_matches
37
  merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
38
  merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
 
39
  if not semantics_matches.empty:
40
  # Aggiungi solo le colonne 'doi' e 'citazioni' da semantics_matches
41
  merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
42
  merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
 
43
 
44
  merged_records.append(merged_record)
45
  # Scrivi il record nel file CSV
@@ -60,39 +62,12 @@ def merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df):
60
  def main():
61
  # Interfaccia utente Streamlit
62
  st.title("Knowledge enrichment")
63
-
64
- if st.button("generate dblp csv"):
65
- # Carica il file XML
66
- tree = ET.parse('output/dblp.xml')
67
- root = tree.getroot()
68
- # Crea un file CSV per scrivere i dati
69
- with open('output/dblp.csv', 'w', newline='', encoding='utf-8') as csvfile:
70
- csvwriter = csv.writer(csvfile, delimiter=';')
71
-
72
- # Scrivi l'intestazione del CSV
73
- csvwriter.writerow(['DBLP', 'Authors', 'Title', 'Year', 'DOI', 'URL'])
74
-
75
- # Estrai i dati XML e scrivili nel CSV
76
- for hit in root.findall('.//hit'):
77
- authors = ', '.join(author.text for author in hit.findall('.//authors/author'))
78
- title = hit.find('.//title').text
79
- year = hit.find('.//year').text
80
- doi_element = hit.find('.//doi')
81
- doi = doi_element.text if doi_element is not None else 'None'
82
- url = hit.find('.//url').text
83
- id = hit.attrib['id']
84
-
85
-
86
- # Scrivi i dati nel file CSV
87
- csvwriter.writerow([id, authors, title, year, doi, url])
88
-
89
- print("Conversione XML in CSV completata.")
90
 
91
  if st.button("create enriched dataset"):
92
  # Carica i file CSV in DataFrame
93
  scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
94
  semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
95
- dblp_df = pd.read_csv("output/dblp.csv", sep=";", encoding='utf-8')
96
 
97
  # Call the merge_and_enrich_datasets function
98
  enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
 
36
  # Aggiungi solo le colonne 'doi' e 'citazioni' da scholar_matches
37
  merged_record['doi_scholar'] = scholar_matches.iloc[0]['DOI']
38
  merged_record['cites_scholar'] = scholar_matches.iloc[0]['Cites']
39
+ merged_record['abstract_scholar'] = scholar_matches.iloc[0]['Abstract']
40
  if not semantics_matches.empty:
41
  # Aggiungi solo le colonne 'doi' e 'citazioni' da semantics_matches
42
  merged_record['doi_semantic'] = semantics_matches.iloc[0]['DOI']
43
  merged_record['cites_semantic'] = semantics_matches.iloc[0]['Cites']
44
+ merged_record['abstract_scholar'] = 'None'
45
 
46
  merged_records.append(merged_record)
47
  # Scrivi il record nel file CSV
 
62
  def main():
63
  # Interfaccia utente Streamlit
64
  st.title("Knowledge enrichment")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  if st.button("create enriched dataset"):
67
  # Carica i file CSV in DataFrame
68
  scholar_df = pd.read_csv("output/googleScholarcsv.csv", sep=";", encoding='utf-8')
69
  semantics_df = pd.read_csv("output/semanticscholarcsv.csv", sep=";", encoding='utf-8')
70
+ dblp_df = pd.read_csv("output/crawler_doi_citation.csv", sep=";", encoding='utf-8')
71
 
72
  # Call the merge_and_enrich_datasets function
73
  enriched_dataset = merge_and_enrich_datasets(scholar_df, semantics_df, dblp_df)
output.csv DELETED
The diff for this file is too large to render. See raw diff
 
output/crawler_doi_citation.csv CHANGED
The diff for this file is too large to render. See raw diff
 
output/output_crawled_data.csv DELETED
The diff for this file is too large to render. See raw diff
 
output/scholar_dblp_semantics.csv CHANGED
The diff for this file is too large to render. See raw diff
 
outputMerged.csv CHANGED
The diff for this file is too large to render. See raw diff