Commit
·
eeea145
1
Parent(s):
6018f49
Refactor ID extraction logic in arvix.py and improve DOI fetching in pmc.py
Browse files
arvix.py
CHANGED
|
@@ -113,7 +113,6 @@ def extract_arxiv_data():
|
|
| 113 |
for temp_id in temp_id_storage:
|
| 114 |
all_ids.append(temp_id)
|
| 115 |
random.shuffle(all_ids)
|
| 116 |
-
print(len(all_ids))
|
| 117 |
if len(all_ids) > 12:
|
| 118 |
print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
|
| 119 |
all_ids = all_ids[:12]
|
|
@@ -124,7 +123,6 @@ def extract_arxiv_data():
|
|
| 124 |
if not tools.upload_datafile('arxiv.txt'):
|
| 125 |
raise Exception("Failed to upload datafile")
|
| 126 |
return data
|
| 127 |
-
|
| 128 |
|
| 129 |
if __name__ == '__main__':
|
| 130 |
data = extract_arxiv_data()
|
|
|
|
| 113 |
for temp_id in temp_id_storage:
|
| 114 |
all_ids.append(temp_id)
|
| 115 |
random.shuffle(all_ids)
|
|
|
|
| 116 |
if len(all_ids) > 12:
|
| 117 |
print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
|
| 118 |
all_ids = all_ids[:12]
|
|
|
|
| 123 |
if not tools.upload_datafile('arxiv.txt'):
|
| 124 |
raise Exception("Failed to upload datafile")
|
| 125 |
return data
|
|
|
|
| 126 |
|
| 127 |
if __name__ == '__main__':
|
| 128 |
data = extract_arxiv_data()
|
pmc.py
CHANGED
|
@@ -32,11 +32,14 @@ def fetch_dois():
|
|
| 32 |
page_content = tools.fetch_page(link)
|
| 33 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
| 34 |
for page_data in page_datas:
|
| 35 |
-
|
| 36 |
-
if doi.startswith('10.'):
|
| 37 |
-
doi_list.append(doi)
|
| 38 |
-
else:
|
| 39 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
doi_data[topic] = doi_list
|
| 41 |
data = json.dumps(doi_data, indent=4, ensure_ascii=False)
|
| 42 |
return data
|
|
@@ -48,6 +51,8 @@ def fetch_doi_data():
|
|
| 48 |
thread = threading.Thread(target=fetch_and_store)
|
| 49 |
thread.start()
|
| 50 |
thread.join()
|
|
|
|
|
|
|
| 51 |
return result[0]
|
| 52 |
|
| 53 |
def doi_to_pmc():
|
|
|
|
| 32 |
page_content = tools.fetch_page(link)
|
| 33 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
| 34 |
for page_data in page_datas:
|
| 35 |
+
if not page_data.find("a", href=True):
|
|
|
|
|
|
|
|
|
|
| 36 |
continue
|
| 37 |
+
else:
|
| 38 |
+
doi = page_data.find("a", href=True).text
|
| 39 |
+
if doi.startswith('10.'):
|
| 40 |
+
doi_list.append(doi)
|
| 41 |
+
else:
|
| 42 |
+
continue
|
| 43 |
doi_data[topic] = doi_list
|
| 44 |
data = json.dumps(doi_data, indent=4, ensure_ascii=False)
|
| 45 |
return data
|
|
|
|
| 51 |
thread = threading.Thread(target=fetch_and_store)
|
| 52 |
thread.start()
|
| 53 |
thread.join()
|
| 54 |
+
if len(result) == 0 or not result or result[0] == None:
|
| 55 |
+
return []
|
| 56 |
return result[0]
|
| 57 |
|
| 58 |
def doi_to_pmc():
|