Spaces:
Running
Running
Petra Vidnerova committed on
Commit ·
a0da684
1
Parent(s): f3b88b8
reproducibility fix
Browse files
- calculate_scores.py +7 -4
- utils/score.py +40 -25
calculate_scores.py
CHANGED
|
@@ -17,16 +17,19 @@ logger.addHandler(handler)
|
|
| 17 |
logger.propagate = False
|
| 18 |
|
| 19 |
@click.command()
|
| 20 |
-
@click.argument("filename", type=click.Path(exists=True),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
@click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
|
| 22 |
default="INFO")
|
| 23 |
@click.option("--use-api-key", is_flag=True, default=False)
|
| 24 |
@click.option("--force-cpu", is_flag=True, default=False)
|
| 25 |
@click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
|
| 26 |
@click.option("--batch-size", default=4)
|
| 27 |
-
def main(filename, log_level, use_api_key, force_cpu, only_cached, batch_size):
|
| 28 |
-
result_backup =
|
| 29 |
-
result_filename = "data/challenge_scores_final.csv"
|
| 30 |
id_string = "OpenAlexID (as URL)"
|
| 31 |
paper_id = "PaperProjectID"
|
| 32 |
title_string = "Title"
|
|
|
|
| 17 |
logger.propagate = False
|
| 18 |
|
| 19 |
@click.command()
|
| 20 |
+
@click.argument("filename", type=click.Path(exists=True),
|
| 21 |
+
default="data/Metadata file COMBINED.csv")
|
| 22 |
+
@click.option("--output", type=click.Path(),
|
| 23 |
+
default="data/challenge_scores.pickle",
|
| 24 |
+
help="Output pickle file to save the raw partial scores.")
|
| 25 |
@click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
|
| 26 |
default="INFO")
|
| 27 |
@click.option("--use-api-key", is_flag=True, default=False)
|
| 28 |
@click.option("--force-cpu", is_flag=True, default=False)
|
| 29 |
@click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
|
| 30 |
@click.option("--batch-size", default=4)
|
| 31 |
+
def main(filename, output, log_level, use_api_key, force_cpu, only_cached, batch_size):
|
| 32 |
+
result_backup = output
|
|
|
|
| 33 |
id_string = "OpenAlexID (as URL)"
|
| 34 |
paper_id = "PaperProjectID"
|
| 35 |
title_string = "Title"
|
utils/score.py
CHANGED
|
@@ -29,6 +29,9 @@ class Paper():
|
|
| 29 |
|
| 30 |
self.status = "OK"
|
| 31 |
self.titles_only = False
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
class Score():
|
|
@@ -207,6 +210,7 @@ class Evaluator():
|
|
| 207 |
"""
|
| 208 |
if self.online and paper.openalexid in self.ref_data_cache:
|
| 209 |
# this mean it is challenge paper and we need reproduciblity
|
|
|
|
| 210 |
paper.title = self.titles_cache.get(paper.openalexid, None)
|
| 211 |
paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
|
| 212 |
paper.references = self.ref_data_cache.get(paper.openalexid, [])
|
|
@@ -262,6 +266,7 @@ class Evaluator():
|
|
| 262 |
return paper
|
| 263 |
|
| 264 |
def fetch_ref_data_batched(self, paper:Paper):
|
|
|
|
| 265 |
if not self.api_key:
|
| 266 |
yield from self.fetch_ref_data(paper)
|
| 267 |
return
|
|
@@ -276,31 +281,34 @@ class Evaluator():
|
|
| 276 |
paper.ref_data.append((title, abstract))
|
| 277 |
else:
|
| 278 |
to_process.append(ref)
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
yield self.check_ref_data(paper)
|
| 306 |
return
|
|
@@ -339,16 +347,23 @@ class Evaluator():
|
|
| 339 |
if ref in self.titles_cache:
|
| 340 |
title = self.titles_cache[ref]
|
| 341 |
else:
|
|
|
|
| 342 |
select_fields.append("title")
|
| 343 |
# abstract
|
| 344 |
if not paper.titles_only:
|
| 345 |
if ref in self.abstracts_cache:
|
| 346 |
abstract = self.abstracts_cache[ref]
|
| 347 |
else:
|
|
|
|
| 348 |
select_fields.append("abstract_inverted_index")
|
| 349 |
else:
|
| 350 |
abstract = None
|
| 351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if not select_fields and title is not None:
|
| 353 |
paper.ref_data.append((title, abstract))
|
| 354 |
continue
|
|
|
|
| 29 |
|
| 30 |
self.status = "OK"
|
| 31 |
self.titles_only = False
|
| 32 |
+
# if challenge paper, we want to have reproducible results,
|
| 33 |
+
# so we use only cached data
|
| 34 |
+
self.challenge_paper = False
|
| 35 |
|
| 36 |
|
| 37 |
class Score():
|
|
|
|
| 210 |
"""
|
| 211 |
if self.online and paper.openalexid in self.ref_data_cache:
|
| 212 |
# this mean it is challenge paper and we need reproduciblity
|
| 213 |
+
paper.challenge_paper = True
|
| 214 |
paper.title = self.titles_cache.get(paper.openalexid, None)
|
| 215 |
paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
|
| 216 |
paper.references = self.ref_data_cache.get(paper.openalexid, [])
|
|
|
|
| 266 |
return paper
|
| 267 |
|
| 268 |
def fetch_ref_data_batched(self, paper:Paper):
|
| 269 |
+
|
| 270 |
if not self.api_key:
|
| 271 |
yield from self.fetch_ref_data(paper)
|
| 272 |
return
|
|
|
|
| 281 |
paper.ref_data.append((title, abstract))
|
| 282 |
else:
|
| 283 |
to_process.append(ref)
|
| 284 |
+
|
| 285 |
+
if not (self.online and paper.challenge_paper):
|
| 286 |
+
|
| 287 |
+
# now process in batches
|
| 288 |
+
for i in range(0, len(to_process), batch_size):
|
| 289 |
+
works = to_process[i:i+batch_size]
|
| 290 |
+
works = [eat_prefix(w) for w in works]
|
| 291 |
+
|
| 292 |
+
url = "https://api.openalex.org/works"
|
| 293 |
+
params = {
|
| 294 |
+
"api_key": self.api_key,
|
| 295 |
+
"filter": "openalex:" + "|".join(works),
|
| 296 |
+
"select": "id,title,abstract_inverted_index"
|
| 297 |
+
}
|
| 298 |
+
data = send_request(url, params, 10, only_cached=self.only_cached)
|
| 299 |
+
if data is None:
|
| 300 |
+
raise ValueError("Error during batched fetching of reference data.")
|
| 301 |
+
for item in data["results"]:
|
| 302 |
+
openalexid = eat_prefix(item["id"])
|
| 303 |
+
title = item.get("title", None)
|
| 304 |
+
abstract = item.get("abstract_inverted_index", None)
|
| 305 |
+
abstract = create_abstract(abstract)
|
| 306 |
+
if title is None:
|
| 307 |
+
logger.warning(f"Title not found for reference {openalexid}. Skipping this reference.")
|
| 308 |
+
continue
|
| 309 |
+
self.titles_cache[openalexid] = title
|
| 310 |
+
self.abstracts_cache[openalexid] = abstract
|
| 311 |
+
paper.ref_data.append((title, abstract))
|
| 312 |
|
| 313 |
yield self.check_ref_data(paper)
|
| 314 |
return
|
|
|
|
| 347 |
if ref in self.titles_cache:
|
| 348 |
title = self.titles_cache[ref]
|
| 349 |
else:
|
| 350 |
+
title = None
|
| 351 |
select_fields.append("title")
|
| 352 |
# abstract
|
| 353 |
if not paper.titles_only:
|
| 354 |
if ref in self.abstracts_cache:
|
| 355 |
abstract = self.abstracts_cache[ref]
|
| 356 |
else:
|
| 357 |
+
abstract = None
|
| 358 |
select_fields.append("abstract_inverted_index")
|
| 359 |
else:
|
| 360 |
abstract = None
|
| 361 |
|
| 362 |
+
if self.online and paper.challenge_paper:
|
| 363 |
+
if title is not None:
|
| 364 |
+
paper.ref_data.append((title, abstract))
|
| 365 |
+
continue
|
| 366 |
+
|
| 367 |
if not select_fields and title is not None:
|
| 368 |
paper.ref_data.append((title, abstract))
|
| 369 |
continue
|