File size: 3,288 Bytes
609bc72
 
5c2ae19
53b0fe6
5c2ae19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609bc72
 
 
 
 
f3b88b8
609bc72
5c2ae19
 
 
 
 
 
 
 
 
 
 
f3b88b8
5c2ae19
 
 
 
 
 
 
 
 
9a7735f
 
 
a912a76
9a7735f
 
 
 
609bc72
 
 
5c2ae19
 
 
a912a76
5c2ae19
b2c029c
5c2ae19
 
a912a76
 
 
 
 
 
 
 
 
5c2ae19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53b0fe6
 
 
 
38bfbd5
 
7c57a27
 
 
 
 
 
 
 
 
 
 
 
 
a912a76
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import logging 
import requests
import torch

from tenacity import (
    retry, stop_after_attempt, retry_if_exception_type, 
    before_sleep_log, wait_random_exponential
)

logger = logging.getLogger("__main__")

@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=10),
    retry=retry_if_exception_type(requests.exceptions.HTTPError),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    retry_error_callback=lambda _: None
)
def send_request(url, params, timeout, only_cached=False):
    """Send a GET request with OpenAlex credentials attached.

    Retried up to 5 times with exponential backoff on HTTPError; after
    the last failed attempt the retry_error_callback makes the call
    return None instead of raising.

    Args:
        url: Full request URL.
        params: Query parameters; "api_key" is filled in from the
            OPENALEX_API_KEY env var when absent.
        timeout: Request timeout in seconds.
        only_cached: When True, skip the network entirely and return {}.

    Returns:
        Parsed JSON dict on success, {} when only_cached, or None on a
        404 response (and, via the retry callback, on retry exhaustion).
    """
    if only_cached:
        return {}
    # Copy before injecting credentials so the caller's dict is not
    # mutated as a side effect of sending the request.
    params = dict(params)
    params.setdefault("api_key", os.getenv("OPENALEX_API_KEY", ""))
    # mailto puts us in the OpenAlex "polite pool" (better rate limits).
    params["mailto"] = "petra@cs.cas.cz"
    response = requests.get(
        url,
        params=params,
        timeout=timeout
    )
    if response.status_code == 404:
        logger.warning(f"Data not found at {url}.")
        return None
    response.raise_for_status()
    return response.json()

def eat_prefix(alexid):
    """Strip the canonical OpenAlex URL prefix from an id, if present."""
    prefix = "https://openalex.org/"
    return alexid[len(prefix):] if alexid.startswith(prefix) else alexid


def download_titles_and_abstracts(works):
    """Yield (title, abstract) string pairs for the given OpenAlex work ids.

    Works that cannot be fetched, or whose title or abstract is missing
    (the API returns the key with a null value, so a plain membership
    test is not enough), are skipped silently.
    """
    for work_id in works:
        data = download_paper_data(work_id, select="title,abstract_inverted_index")
        if data is None:
            continue
        title = data.get("title")
        abstract = data.get("abstract")
        # Require actual text: "abstract" in data is True even when the
        # work has no abstract (value None set by download_paper_data).
        if title is not None and abstract is not None:
            yield (title, abstract)


def download_paper_data(alexid, select: str, only_cached=False):
    """Fetch selected fields for one OpenAlex work.

    Args:
        alexid: Work id, with or without the https://openalex.org/ prefix.
        select: Comma-separated field list for the API's ``select`` param.
        only_cached: When True, skip the network and return {}.

    Returns:
        The response dict with ``abstract_inverted_index`` (when selected)
        replaced by a reconstructed plain-text ``abstract`` (None when the
        index is empty/null), or None when the work was not found.
    """
    if only_cached:
        return {}
    base_url = "https://api.openalex.org/works/"
    full_url = base_url + eat_prefix(alexid)
    params = {
        "select": select
    }
    data = send_request(full_url, params, timeout=50)
    if data is None:
        return None
    if "abstract_inverted_index" in data:
        # Replace the inverted index with readable text; a falsy index
        # (None or {}) means the work has no abstract available.
        inverted = data.pop("abstract_inverted_index")
        data["abstract"] = create_abstract(inverted) if inverted else None
    return data

def create_abstract(abstract_index):
    """Reconstruct abstract text from an OpenAlex inverted index.

    The index maps each word to the list of positions where it occurs;
    the text is rebuilt by placing every word at its positions and
    joining with spaces.

    Args:
        abstract_index: dict of word -> list[int] positions, or None.

    Returns:
        The reconstructed abstract string ("" for an empty index), or
        None when abstract_index is None.
    """
    if abstract_index is None:
        return None
    # Highest position determines output length; default=-1 keeps an
    # empty index (or empty position lists) from raising ValueError.
    last = max(
        (i for positions in abstract_index.values() for i in positions),
        default=-1,
    )
    words = [""] * (last + 1)
    for word, positions in abstract_index.items():
        for i in positions:
            words[i] = word
    return " ".join(words)

def calculate_score(paper_embedding, ref_embeddings):
    """Return a dissimilarity score between a paper and its references.

    Computed as 1 minus the mean cosine similarity between the paper
    embedding and every reference embedding.

    Args:
        paper_embedding: Tensor of shape (1, n), or None to fall back to
            the centroid of the references.
        ref_embeddings: Tensor of shape (m, n).

    Returns:
        float in [0, 2]; 0.0 when there are no references.
    """
    if ref_embeddings.shape[0] == 0:
        return 0.0
    # No paper embedding: score against the reference centroid instead.
    if paper_embedding is None:
        paper_embedding = ref_embeddings.mean(axis=0).unsqueeze(0)

    unit_paper = torch.nn.functional.normalize(paper_embedding, p=2, dim=1)  # (1, n)
    unit_refs = torch.nn.functional.normalize(ref_embeddings, p=2, dim=1)    # (m, n)

    # Dot products of unit vectors == cosine similarities, shape (m, 1).
    mean_similarity = (unit_refs @ unit_paper.T).flatten().mean().item()
    return 1.0 - mean_similarity