raannakasturi committed on
Commit
cf4432c
·
verified ·
1 Parent(s): 73baaf7

Update fetch_paper_data.py

Browse files
Files changed (1) hide show
  1. fetch_paper_data.py +71 -70
fetch_paper_data.py CHANGED
@@ -1,71 +1,72 @@
1
- import re
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from xml.etree import ElementTree as ET
5
- import json
6
-
7
- HEADERS = {
8
- 'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
9
- }
10
-
11
- def fetch_pmc_doi(pmc_id):
12
- url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={pmc_id}&format=json"
13
- response = requests.get(url, headers=HEADERS).json()
14
- if response['status'] == 'ok':
15
- doi = response['records'][0]['doi']
16
- return f"https://doi.org/{doi}"
17
-
18
- def fetch_pmc_pdf(pmc_id):
19
- url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
20
- response = requests.get(url, headers=HEADERS).content
21
- records = ET.fromstring(response).find('records').findall('record')
22
- for record in records:
23
- if record.attrib['id'] == pmc_id:
24
- pdf_url = record.find('link').attrib['href']
25
- return pdf_url.replace('ftp://', 'https://')
26
- else:
27
- return None
28
-
29
- def fetch_arxiv_doi(arxiv_id):
30
- page_url = f"https://arxiv.org/abs/{arxiv_id}"
31
- page_content = requests.get(page_url, headers=HEADERS).content
32
- page_data = BeautifulSoup(page_content, 'html.parser')
33
- doi = page_data.find('td', {'class': "tablecell arxivdoi"}).find('a', {'id': 'arxiv-doi-link'}).text
34
- return doi
35
-
36
- def fetch_citation(doi):
37
- citation_content = requests.get(doi, headers={ 'User-Agent':HEADERS['User-Agent'], 'Accept': 'text/x-bibliography; style=apa'}).content
38
- return citation_content.decode('utf-8')
39
-
40
- def fetch_title(doi):
41
- title_content = requests.get(doi, headers={ 'User-Agent':HEADERS['User-Agent'], 'Accept': 'text/x-bibliography; style=bibtex'}).content
42
- bibtex_entry = title_content.decode('utf-8').strip()
43
- title = re.search(r'title\s*=\s*{(.*?)}', bibtex_entry)
44
- if title:
45
- return title.group(1)
46
- return None
47
-
48
- def fetch_paper_data(id):
49
- data = {}
50
- try:
51
- if id.startswith('PMC'):
52
- doi = fetch_pmc_doi(id)
53
- pdf_url = fetch_pmc_pdf(id)
54
- else:
55
- doi = fetch_arxiv_doi(id)
56
- pdf_url = f"https://arxiv.org/pdf/{id}"
57
- citation = fetch_citation(doi).replace('\n', ' ').strip()
58
- title = fetch_title(doi).replace('\n', ' ').strip()
59
- data['status'] = 'success'
60
- data['doi'] = doi
61
- data['title'] = title
62
- data['pdf_url'] = pdf_url
63
- data['citation'] = citation
64
- except Exception as e:
65
- data['status'] = 'error'
66
- print(str(e))
67
- return json.dumps(data, indent=4, ensure_ascii=False)
68
-
69
- if __name__ == '__main__':
70
- citation = fetch_paper_data('PMC8391798')
 
71
  print(citation)
 
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from xml.etree import ElementTree as ET
5
+ import json
6
+
7
# Default request headers shared by the HTTP calls in this module.
# The User-Agent literal is split across adjacent string pieces purely for
# readability; the concatenated value is a single header string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 AppleWebKit/537.36 '
        '(KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) '
        'Chrome/131.0.6778.135 Safari/537.36'
    ),
}
10
+
11
def fetch_pmc_doi(pmc_id):
    """Resolve a PubMed Central ID to a DOI URL using the NCBI ID converter.

    Args:
        pmc_id: PMC identifier string, e.g. ``"PMC8391798"``.

    Returns:
        ``"https://doi.org/<doi>"`` when the converter response has status
        ``"ok"`` and its first record carries a ``doi`` value; otherwise
        ``None``.

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.get(
        "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
        # Let requests URL-encode the values instead of splicing the raw ID
        # into the query string.
        params={
            "email": "raannakasturi@gmail.com",
            "ids": pmc_id,
            "format": "json",
        },
        headers=HEADERS,
        timeout=30,  # never hang indefinitely on an unresponsive server
    )
    payload = response.json()
    if payload.get('status') != 'ok':
        return None
    # Guard against an empty record list or a record without a 'doi' entry
    # rather than raising IndexError/KeyError.
    records = payload.get('records') or []
    doi = records[0].get('doi') if records else None
    if not doi:
        return None
    return f"https://doi.org/{doi}"
17
+
18
def fetch_pmc_pdf(pmc_id):
    """Look up the PDF link for a PMC article via the PMC OA web service.

    The XML reply is scanned for a ``<record>`` whose ``id`` attribute equals
    ``pmc_id``; the ``href`` of that record's ``<link>`` child is returned
    with a leading ``ftp://`` scheme rewritten to ``https://``.

    Args:
        pmc_id: PMC identifier string, e.g. ``"PMC8391798"``.

    Returns:
        The PDF URL as a string, or ``None`` when the reply contains no
        matching record with a link.

    Raises:
        requests.RequestException: on connection failure or timeout.
        xml.etree.ElementTree.ParseError: if the reply is not well-formed XML.
    """
    response = requests.get(
        "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
        params={"id": pmc_id, "format": "pdf"},
        headers=HEADERS,
        timeout=30,  # never hang indefinitely on an unresponsive server
    )
    root = ET.fromstring(response.content)
    records = root.find('records')
    if records is None:
        # No <records> element in the reply (presumably an error document);
        # report "not found" instead of failing with AttributeError.
        return None
    # Check every record, not just the first one, before giving up.
    for record in records.findall('record'):
        if record.get('id') != pmc_id:
            continue
        link = record.find('link')
        if link is None or 'href' not in link.attrib:
            continue
        return link.attrib['href'].replace('ftp://', 'https://')
    return None
28
+
29
def fetch_arxiv_doi(arxiv_id):
    """Scrape the DOI link text from an arXiv abstract page.

    Fetches ``https://arxiv.org/abs/<arxiv_id>`` and reads the text of the
    ``<a id="arxiv-doi-link">`` element inside the
    ``<td class="tablecell arxivdoi">`` cell.

    Args:
        arxiv_id: arXiv identifier string used to build the abstract URL.

    Returns:
        The link text with surrounding whitespace removed, or ``None`` when
        the page does not contain the expected cell or link.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    page = requests.get(
        f"https://arxiv.org/abs/{arxiv_id}",
        headers=HEADERS,
        timeout=30,  # never hang indefinitely on an unresponsive server
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    # Each lookup can return None (e.g. unknown ID or changed page layout);
    # check step by step instead of chaining attribute access on None.
    doi_cell = soup.find('td', {'class': "tablecell arxivdoi"})
    if doi_cell is None:
        return None
    doi_link = doi_cell.find('a', {'id': 'arxiv-doi-link'})
    if doi_link is None:
        return None
    return doi_link.text.strip()
35
+
36
def fetch_citation(doi):
    """Fetch an APA-formatted citation for a DOI URL.

    Requests the DOI URL with an ``Accept`` header asking for
    ``text/x-bibliography; style=apa`` and returns the response body.

    Args:
        doi: Full DOI URL, e.g. ``"https://doi.org/10.1000/xyz"``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure, timeout, or an
            invalid URL.
    """
    response = requests.get(
        doi,
        headers={
            'User-Agent': HEADERS['User-Agent'],
            'Accept': 'text/x-bibliography; style=apa',
        },
        timeout=30,  # never hang indefinitely on an unresponsive server
    )
    # Without this check the body of an error page would be returned as if
    # it were the citation text.
    response.raise_for_status()
    return response.content.decode('utf-8')
39
+
40
def _extract_bibtex_title(bibtex_entry):
    """Return the brace-delimited value of the ``title`` field, or None.

    Braces are matched by depth so that nested groups such as
    ``title={The {DNA} story}`` are returned whole (inner braces included)
    instead of being cut at the first ``}``. The value may span lines.
    """
    # \b keeps "booktitle = {...}" from being mistaken for the title field.
    opening = re.search(r'\btitle\s*=\s*\{', bibtex_entry)
    if opening is None:
        return None
    begin = opening.end()
    depth = 1
    pos = begin
    while pos < len(bibtex_entry):
        ch = bibtex_entry[pos]
        if ch == '\\':
            # Skip the escaped character so "\{" / "\}" do not affect depth.
            pos += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return bibtex_entry[begin:pos]
        pos += 1
    # Unbalanced braces: no complete title value present.
    return None


def fetch_title(doi):
    """Fetch the title of a paper from the BibTeX record of its DOI URL.

    Requests the DOI URL with an ``Accept`` header asking for
    ``text/x-bibliography; style=bibtex`` and extracts the ``title`` field
    from the returned entry.

    Args:
        doi: Full DOI URL, e.g. ``"https://doi.org/10.1000/xyz"``.

    Returns:
        The title string, or ``None`` when the entry has no brace-delimited
        ``title`` field.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure, timeout, or an
            invalid URL.
    """
    response = requests.get(
        doi,
        headers={
            'User-Agent': HEADERS['User-Agent'],
            'Accept': 'text/x-bibliography; style=bibtex',
        },
        timeout=30,  # never hang indefinitely on an unresponsive server
    )
    # Do not try to parse the body of an error page as BibTeX.
    response.raise_for_status()
    bibtex_entry = response.content.decode('utf-8').strip()
    return _extract_bibtex_title(bibtex_entry)
47
+
48
def fetch_paper_data(id):
    """Collect DOI, title, PDF URL and citation for a paper as a JSON string.

    IDs starting with ``"PMC"`` are resolved through ``fetch_pmc_doi`` and
    ``fetch_pmc_pdf``; any other ID is treated as an arXiv ID, resolved
    through ``fetch_arxiv_doi``, and given the PDF URL
    ``https://arxiv.org/pdf/<id>``.

    Args:
        id: Paper identifier string (PMC or arXiv). The parameter name
            shadows the ``id`` builtin but is kept for caller compatibility.

    Returns:
        A JSON document (indent 4, non-ASCII preserved). On success it is
        ``{"status": "success", "data": {"doi", "title", "pdf_url",
        "citation"}}``; ``title`` and ``pdf_url`` may be ``null`` when they
        could not be determined. On any failure it is
        ``{"status": "error"}`` and the error message is printed.
    """
    data = {}
    try:
        if id.startswith('PMC'):
            doi = fetch_pmc_doi(id)
            pdf_url = fetch_pmc_pdf(id)
        else:
            doi = fetch_arxiv_doi(id)
            pdf_url = f"https://arxiv.org/pdf/{id}"
        if not doi:
            # Fail with a meaningful message instead of passing None on to
            # the HTTP layer.
            raise ValueError(f"no DOI found for {id!r}")
        citation = fetch_citation(doi).replace('\n', ' ').strip()
        title = fetch_title(doi)
        if title is not None:
            # fetch_title returns None when the BibTeX entry has no title;
            # a missing title should not discard the rest of the result.
            title = title.replace('\n', ' ').strip()
        data['status'] = 'success'
        data['data'] = {
            'doi': doi,
            'title': title,
            'pdf_url': pdf_url,
            'citation': citation,
        }
    except Exception as e:
        # Top-level boundary: report the failure in the payload and on stdout.
        data['status'] = 'error'
        print(str(e))
    return json.dumps(data, indent=4, ensure_ascii=False)
69
+
70
if __name__ == '__main__':
    # Manual smoke run: look up one sample PMC article and print the JSON
    # document returned by fetch_paper_data.
    paper_json = fetch_paper_data('PMC8391798')
    print(paper_json)