Petra Vidnerova committed on
Commit
5c2ae19
·
1 Parent(s): 451bfc6
Files changed (1) hide show
  1. utils.py +66 -0
utils.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import requests
3
+ import time
4
+
5
+ from tenacity import (
6
+ retry, stop_after_attempt, retry_if_exception_type,
7
+ before_sleep_log, wait_random_exponential
8
+ )
9
+
10
# Module-level logger, looked up under the fixed name "__main__" rather than
# under this module's own name.
# NOTE(review): presumably chosen so these messages share the entry script's
# logging configuration — confirm against the application that imports this.
+ logger = logging.getLogger("__main__")
11
+
12
@retry(
    # Give up after five attempts in total.
    stop=stop_after_attempt(5),
    # Randomised exponential back-off between attempts, capped at 10 seconds.
    wait=wait_random_exponential(multiplier=1, max=10),
    # Only HTTP error statuses trigger a retry; any other exception
    # (timeouts, connection errors, invalid JSON) propagates immediately.
    retry=retry_if_exception_type(requests.exceptions.HTTPError),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    # When all attempts are exhausted, return None instead of raising.
    retry_error_callback=lambda _: None,
)
def send_request(url, params, timeout):
    """Send a GET request and return the decoded JSON response.

    A ``mailto`` query parameter is added to every request. The caller's
    ``params`` mapping is left untouched.

    Args:
        url: Address to request.
        params: Mapping of query parameters; ``None`` is treated as empty.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        The parsed JSON body, or ``None`` when the server answers 404 or when
        every retry attempt ended with an HTTP error status.

    Raises:
        Exceptions other than ``requests.exceptions.HTTPError`` raised while
        sending the request or decoding the body are not retried and
        propagate to the caller.
    """
    # Build the query on a copy so the caller's dict is not modified
    # (the function is also re-entered on every retry).
    query = dict(params or {})
    query["mailto"] = "petra@cs.cas.cz"
    response = requests.get(url, params=query, timeout=timeout)
    if response.status_code == 404:
        # A missing resource is final, so skip raise_for_status() and
        # thereby the retry machinery.
        logger.warning(f"Data not found at {url}.")
        return None
    response.raise_for_status()
    return response.json()
32
+
33
def eat_prefix(alexid):
    """Return ``alexid`` without a leading ``https://openalex.org/``.

    Identifiers that do not start with that prefix are returned unchanged.
    """
    url_prefix = "https://openalex.org/"
    return alexid[len(url_prefix):] if alexid.startswith(url_prefix) else alexid
39
+
40
def download_paper_data(alexid):
    """Download title, abstract and references of one OpenAlex work.

    Args:
        alexid: Work identifier, either bare or prefixed with
            ``https://openalex.org/``.

    Returns:
        The dict returned by ``send_request`` for the work. When its
        ``abstract_inverted_index`` field is not ``None``, an ``abstract``
        key holding the reconstructed text is added. If the identifier is
        empty or no data could be fetched, a dict with a single ``error``
        key is returned instead.
    """
    work_id = eat_prefix(alexid)
    if not work_id:
        # Without an ID the URL would address the whole collection rather
        # than a single work, so report the failure right away.
        return {"error": "Error during fetching data for given OpenAlex ID."}

    full_url = "https://api.openalex.org/works/" + work_id
    params = {
        "select": "title,abstract_inverted_index,referenced_works",
    }
    timeout = 10
    data = send_request(full_url, params, timeout)
    if data is None:
        return {"error": "Error during fetching data for given OpenAlex ID."}

    # .get() tolerates a payload that lacks the field instead of raising
    # KeyError.
    inverted_index = data.get("abstract_inverted_index")
    if inverted_index is not None:
        data["abstract"] = create_abstract(inverted_index)
    return data
53
+
54
def create_abstract(abstract_index):
    """Rebuild plain text from an inverted index of word positions.

    Args:
        abstract_index: Mapping of each word to the list of positions at
            which it occurs, or ``None``.

    Returns:
        The words joined by single spaces in position order, or ``None``
        when ``abstract_index`` is ``None``. Positions not claimed by any
        word are left as empty strings. An index without any positions
        yields an empty string.
    """
    if abstract_index is None:
        return None
    # Length is one past the highest position; default=0 covers an empty
    # index and words whose position list is empty.
    length = max(
        (
            position + 1
            for positions in abstract_index.values()
            for position in positions
        ),
        default=0,
    )
    words = [""] * length
    for word, positions in abstract_index.items():
        for position in positions:
            words[position] = word
    return " ".join(words)