Stereo0001 commited on
Commit
651accf
·
verified ·
1 Parent(s): 4c654a2

Create ppp.py

Browse files
Files changed (1) hide show
  1. ppp.py +215 -0
ppp.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import sys
import time
import urllib
import urllib.parse
import requests
from bs4 import BeautifulSoup
8
+
9
def get_main_url(url):
    """Return the first three "/"-separated segments of *url* — i.e. the
    scheme plus host ("https://example.com") for a typical http(s) URL."""
    segments = url.split("/")
    return "/".join(segments[:3])
11
+
12
def save_pdf_from_url(pdf_url, directory, name, headers):
    """Download *pdf_url* and save it as <directory>/<name>.pdf.

    If the response body is not a real PDF but a "Preparing to download"
    interstitial page, retry once via the Europe PMC render service using
    the PMC id embedded in the URL. Network failures are printed, never
    raised, so one bad article cannot abort a batch run.

    Args:
        pdf_url: direct (or presumed-direct) URL of the PDF.
        directory: existing output directory.
        name: file stem for the saved PDF.
        headers: HTTP headers to send (browser-like User-Agent expected).
    """
    try:
        # timeout added: requests has no default timeout and would otherwise
        # hang forever on a stalled server (Timeout is a RequestException,
        # so it is handled by the except below).
        response = requests.get(pdf_url, headers=headers, allow_redirects=True, timeout=60)
        response.raise_for_status()

        # A genuine PDF always starts with the '%PDF' magic bytes.
        if not response.content.startswith(b'%PDF'):
            content_str = response.content.decode('utf-8', errors='ignore')
            if 'Preparing to download' in content_str:
                pmc_match = re.search(r'PMC\d+', pdf_url)
                if pmc_match:
                    pmc_id = pmc_match.group()
                    alt_url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmc_id}&blobtype=pdf"
                    print(f"** Trying alternative URL: {alt_url}")
                    response = requests.get(alt_url, headers=headers, allow_redirects=True, timeout=60)
                    response.raise_for_status()

        with open(f'{directory}/{name}.pdf', 'wb') as f:
            f.write(response.content)
        print(f"** Successfully fetched and saved PDF for PMCID {name}. File size: {len(response.content)} bytes")
    except requests.RequestException as e:
        print(f"** Failed to download PDF from {pdf_url}: {e}")
33
+
34
def fetch(pmcid, finders, name, headers, error_pmids, args):
    """Fetch the PDF for one PMCID by trying each finder in order.

    Loads the NCBI PMC article page, then runs each finder until one
    returns a PDF URL, which is handed to save_pdf_from_url(). PMCIDs that
    cannot be resolved (or whose page request fails) are logged to
    *error_pmids* as "pmcid<TAB>name". Skips IDs already present in
    args['out'].

    Args:
        pmcid: the PMC identifier (e.g. "PMC1234567").
        finders: finder function names (strings) or callables, tried in order.
        name: output file stem for the PDF.
        headers: HTTP headers for all requests.
        error_pmids: open writable file for failure records.
        args: config dict; uses args['out'] as the output directory.
    """
    uri = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid.strip()}"
    success = False
    if os.path.exists(f"{args['out']}/{pmcid}.pdf"):
        print(f"** Reprint #{pmcid} already downloaded and in folder; skipping.")
        return

    try:
        # timeout added: requests has no default and can hang indefinitely.
        req = requests.get(uri, headers=headers, timeout=60)
        req.raise_for_status()
        soup = BeautifulSoup(req.content, 'lxml')
        for finder in finders:
            print(f"Trying {finder}")
            # Resolve the finder by name instead of eval(): the same string
            # entries keep working, but arbitrary expressions can no longer
            # be executed. Callables are accepted directly.
            finder_func = finder if callable(finder) else globals()[finder]
            pdf_url = finder_func(req, soup, headers)
            if pdf_url:
                save_pdf_from_url(pdf_url, args['out'], name, headers)
                success = True
                break

        if not success:
            print(f"** Reprint {pmcid} could not be fetched with the current finders.")
            error_pmids.write(f"{pmcid}\t{name}\n")

    except requests.RequestException as e:
        print(f"** Request failed for PMCID {pmcid}: {e}")
        error_pmids.write(f"{pmcid}\t{name}\n")
60
+
61
def acs_publications(req, soup, headers):
    """Finder: ACS Publications — anchor whose title mentions a hi/lo-res PDF.

    Returns an absolute PDF URL, or None when no matching anchor exists.
    """
    for anchor in soup.find_all('a'):
        title = anchor.get('title')
        if not title:
            continue
        lowered = title.lower()
        if 'high-res pdf' in lowered or 'low-res pdf' in lowered:
            print("** Using ACS Publications finder...")
            return get_main_url(req.url) + anchor.get('href')
    return None
67
+
68
def future_medicine(req, soup, headers):
    """Finder: Future Medicine — first anchor whose href matches "/doi/pdf".

    Returns an absolute PDF URL, or None when no such link exists.
    """
    link = soup.find('a', attrs={'href': re.compile("/doi/pdf")})
    if link is None:
        return None
    print("** Using Future Medicine finder...")
    return get_main_url(req.url) + link.get('href')
74
+
75
def generic_citation_labelled(req, soup, headers):
    """Finder: generic — read the standard <meta name="citation_pdf_url"> tag.

    Returns the tag's content attribute, or None when the tag is absent.
    """
    meta = soup.find('meta', attrs={'name': 'citation_pdf_url'})
    if meta is None:
        return None
    print("** Using Generic Citation Labelled finder...")
    return meta.get('content')
81
+
82
def nejm(req, soup, headers):
    """Finder: NEJM — anchor with data-download-type "article pdf".

    Returns an absolute PDF URL, or None when no matching anchor exists.
    """
    for anchor in soup.find_all('a'):
        download_type = anchor.get('data-download-type')
        if download_type and download_type.lower() == 'article pdf':
            print("** Using NEJM finder...")
            return get_main_url(req.url) + anchor.get('href')
    return None
88
+
89
def pubmed_central_v2(req, soup, headers):
    """Finder: PubMed Central — first relative "/pmc/articles" link,
    anchored to the NCBI host.

    Returns an absolute URL, or None when no such link exists.
    """
    link = soup.find('a', attrs={'href': re.compile('/pmc/articles')})
    if link is None:
        return None
    print("** Using PubMed Central V2 finder...")
    return f"https://www.ncbi.nlm.nih.gov{link.get('href')}"
95
+
96
def science_direct(req, soup, headers):
    """Finder: ScienceDirect — resolve the interstitial redirect page.

    The PMC page hands off to ScienceDirect via a form whose first <input>
    carries the (percent-encoded) target URL. Fetch that page, then read
    its citation_pdf_url <meta> tag. Returns the PDF URL or None; any
    failure disqualifies this finder rather than raising.
    """
    try:
        # First <input> value holds the percent-encoded follow-up URL.
        # NOTE(review): relies on urllib.parse being importable via the bare
        # `import urllib` at file top — confirm, or import urllib.parse.
        new_uri = urllib.parse.unquote(soup.find_all('input')[0].get('value'))
        # Rebind req/soup to the resolved ScienceDirect page.
        req = requests.get(new_uri, allow_redirects=True, headers=headers)
        req.raise_for_status()
        soup = BeautifulSoup(req.content, 'lxml')
        links = soup.find_all('meta', attrs={'name': 'citation_pdf_url'})
        if links:
            print("** Using Science Direct finder...")
            return links[0].get('content')
    except Exception as e:
        # Broad catch is deliberate: parsing or network problems here should
        # just fall through to the next finder, not abort the fetch.
        print(f"** Science Direct finder error: {e}")
    return None
109
+
110
def uchicago_press(req, soup, headers):
    """Finder: UChicago Press — anchor whose href contains both "pdf" and
    ".edu/doi/".

    Returns an absolute PDF URL, or None when no matching anchor exists.
    """
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and 'pdf' in href and '.edu/doi/' in href:
            print("** Using UChicago Press finder...")
            return get_main_url(req.url) + href
    return None
116
+
117
def europe_pmc_service(req, soup, headers):
    """Finder: Europe PMC render service — build the backend PDF URL from
    the PMC id embedded in the request URL.

    Returns the service URL, or None when req.url contains no "PMC<digits>".
    """
    match = re.search(r'PMC\d+', req.url)
    if match is None:
        return None
    pmc_id = match.group()
    print(f"** Using Europe PMC Service finder for {pmc_id}...")
    return f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmc_id}&blobtype=pdf"
124
+
125
def main(pcds):
    """Download the PDF for each PMCID in *pcds* (comma-separated string).

    PDFs are written to ./fetched_pdfs/<pmcid>.pdf; PMCIDs that could not
    be fetched are logged to unfetched_pmcids.tsv. Connection resets are
    retried up to maxRetries times, and the loop sleeps `delay` seconds
    after every `batch` attempts to be polite to the servers.

    Args:
        pcds: comma-separated PMCIDs, e.g. "PMC123,PMC456".
    """
    args = {
        'pmcids': f'{pcds}',               # PMCIDs to download (comma-separated); '%#$' marks "unset"
        'pmf': '%#$',                      # optional path to a PMCID list file ("id" or "id name" per line)
        'out': 'fetched_pdfs',             # output directory for PDFs
        'errors': 'unfetched_pmcids.tsv',  # TSV log of failed PMCIDs
        'maxRetries': 3,
        'batch': 10,                       # attempts between sleeps
        'delay': 5                         # seconds to sleep between batches
    }

    # Exactly one of pmcids / pmf must be provided ('%#$' is the "unset" sentinel).
    if args['pmcids'] == '%#$' and args['pmf'] == '%#$':
        print("Error: 必须提供 pmcids 或 pmf")
        return
    if args['pmcids'] != '%#$' and args['pmf'] != '%#$':
        print("Warning: 同时提供了 pmcids 和 pmf,忽略 pmf")
        args['pmf'] = '%#$'

    if not os.path.exists(args['out']):
        print(f"创建输出目录: {args['out']}")
        os.mkdir(args['out'])

    # Browser-like User-Agent: some publisher sites reject the requests default.
    headers = requests.utils.default_headers()
    headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    if args['pmcids'] != '%#$':
        pmcids = args['pmcids'].split(",")
        names = pmcids
    else:
        # Read the ID file inside a context manager so the handle is closed
        # (the original leaked the open file object).
        with open(args['pmf']) as pmf_file:
            pmcids = [line.strip().split() for line in pmf_file]
        if len(pmcids[0]) == 1:
            # One column: the PMCID doubles as the output file name.
            pmcids = [x[0] for x in pmcids]
            names = pmcids
        else:
            # Two columns: "pmcid name".
            names = [x[1] for x in pmcids]
            pmcids = [x[0] for x in pmcids]

    # Finder names in priority order; fetch() resolves each name to a function.
    finders = [
        'europe_pmc_service',
        'generic_citation_labelled',
        'pubmed_central_v2',
        'acs_publications',
        'uchicago_press',
        'nejm',
        'future_medicine',
        'science_direct'
    ]

    batch_count = 0
    with open(args['errors'], 'w+') as error_pmids:
        for pmcid, name in zip(pmcids, names):
            print(f"Trying to fetch PMCID {pmcid.strip()}")
            retries_so_far = 0
            while retries_so_far < args['maxRetries']:
                try:
                    fetch(pmcid, finders, name, headers, error_pmids, args)
                    retries_so_far = args['maxRetries']  # success/handled: leave retry loop
                except requests.ConnectionError as e:
                    if '104' in str(e):
                        # errno 104 "connection reset by peer" — transient, retry.
                        retries_so_far += 1
                        if retries_so_far < args['maxRetries']:
                            print(f"** Retry {retries_so_far}/{args['maxRetries']} for {pmcid} due to error {e}")
                        else:
                            print(f"** Max retries reached for {pmcid}")
                            error_pmids.write(f"{pmcid}\t{name}\n")
                    else:
                        print(f"** Connection error for {pmcid}: {e}")
                        retries_so_far = args['maxRetries']
                        error_pmids.write(f"{pmcid}\t{name}\n")
                except Exception as e:
                    # Catch-all so one bad article cannot abort the whole run.
                    print(f"** General error for {pmcid}: {e}")
                    retries_so_far = args['maxRetries']
                    error_pmids.write(f"{pmcid}\t{name}\n")
            batch_count += 1
            if batch_count % args['batch'] == 0:
                print(f"** Batch limit reached. Sleeping for {args['delay']} seconds...")
                time.sleep(args['delay'])
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+