import os
import logging
import requests
import torch
from tenacity import (
retry, stop_after_attempt, retry_if_exception_type,
before_sleep_log, wait_random_exponential
)
logger = logging.getLogger("__main__")
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=10),
    retry=retry_if_exception_type(requests.exceptions.HTTPError),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    retry_error_callback=lambda _: None
)
def send_request(url, params, timeout, only_cached=False):
    """GET `url` with OpenAlex credentials and return the decoded JSON.

    Retries up to 5 times with random exponential backoff on HTTPError;
    after the retries are exhausted, `retry_error_callback` turns the
    failure into a `None` return instead of raising.

    :param url: full request URL.
    :param params: query parameters; copied internally so the caller's
        dict is never mutated.
    :param timeout: per-request timeout in seconds for `requests.get`.
    :param only_cached: when True, skip the network entirely and return {}.
    :return: parsed JSON dict, None on 404, or None after failed retries.
    """
    if only_cached:
        return {}
    # Work on a copy: the original added api_key/mailto into the caller's
    # dict in place, a surprising side effect for repeated calls.
    params = dict(params)
    if "api_key" not in params:
        params["api_key"] = os.getenv("OPENALEX_API_KEY", "")
    params["mailto"] = "petra@cs.cas.cz"
    response = requests.get(
        url,
        params=params,
        timeout=timeout
    )
    if response.status_code == 404:
        # 404 means "no such work" — an expected outcome, not worth retrying.
        logger.warning("Data not found at %s.", url)
        return None
    # Any other 4xx/5xx raises HTTPError, which triggers the retry decorator.
    response.raise_for_status()
    return response.json()
def eat_prefix(alexid):
    """Strip the canonical "https://openalex.org/" prefix from an id, if present."""
    prefix = "https://openalex.org/"
    return alexid[len(prefix):] if alexid.startswith(prefix) else alexid
def download_titles_and_abstracts(works):
    """Yield (title, abstract) pairs for each OpenAlex work id in `works`.

    Works whose download failed, or whose record lacks a title or abstract,
    are silently skipped.
    """
    for work in works:
        record = download_paper_data(work, select="title,abstract_inverted_index")
        if record is None:
            continue
        if "title" not in record or "abstract" not in record:
            continue
        yield record["title"], record["abstract"]
def download_paper_data(alexid, select: str, only_cached=False):
    """Download one work's record from the OpenAlex API.

    The OpenAlex ``abstract_inverted_index`` field, when requested, is
    decoded into a plain-text ``abstract`` entry and the raw index is
    removed from the returned dict.

    :param alexid: OpenAlex work id, with or without the URL prefix.
    :param select: comma-separated field list for the API's ``select`` param.
    :param only_cached: when True, skip the network and return {}.
    :return: the (possibly transformed) record dict, {} when only_cached,
        or None when the work was not found / the request failed.
    """
    if only_cached:
        return {}
    base_url = "https://api.openalex.org/works/"
    full_url = base_url + eat_prefix(alexid)
    params = {
        "select": select
    }
    timeout = 50
    data = send_request(full_url, params, timeout)
    if data is None:
        return None
    if "abstract_inverted_index" in data:
        # pop once instead of the original's duplicated `del` in both branches;
        # a falsy index (None/empty) maps to abstract=None, as before.
        inverted = data.pop("abstract_inverted_index")
        data["abstract"] = create_abstract(inverted) if inverted else None
    return data
def create_abstract(abstract_index):
    """Rebuild abstract text from an OpenAlex inverted index.

    The index maps each word to the list of positions where it occurs.
    Returns None when the index itself is None; an empty index yields "".
    """
    if abstract_index is None:
        return None
    # Highest position seen determines the length of the word list.
    highest = 0
    for positions in abstract_index.values():
        highest = max(highest, max(positions))
    slots = [""] * (highest + 1)
    for word, positions in abstract_index.items():
        for pos in positions:
            slots[pos] = word
    return " ".join(slots)
def calculate_score(paper_embedding, ref_embeddings):
    """Return 1 - mean cosine similarity between a paper and its references.

    :param paper_embedding: embedding of the paper; assumed shape (1, n) —
        TODO confirm against callers.  When None, the centroid (mean) of
        `ref_embeddings` stands in for the paper.
    :param ref_embeddings: reference embeddings, shape (m, n).
    :return: float in [0, 2]: 1.0 minus the average cosine similarity,
        or 0.0 when there are no references.
    """
    if ref_embeddings.shape[0] == 0:
        return 0.0
    if paper_embedding is None:
        # Fall back to the references' centroid as the "paper" vector.
        paper_embedding = ref_embeddings.mean(dim=0).unsqueeze(0)
    paper_normed = torch.nn.functional.normalize(paper_embedding, p=2, dim=1)  # (1, n)
    ref_normed = torch.nn.functional.normalize(ref_embeddings, p=2, dim=1)     # (m, n)
    # Cosine similarity of every reference against the paper: (m, 1).
    cosine_sim_matrix = ref_normed @ paper_normed.T
    score = cosine_sim_matrix.flatten().mean().item()
    return 1.0 - score