Petra Vidnerova committed on
Commit
a0da684
·
1 Parent(s): f3b88b8

reproducibility fix

Browse files
Files changed (2) hide show
  1. calculate_scores.py +7 -4
  2. utils/score.py +40 -25
calculate_scores.py CHANGED
@@ -17,16 +17,19 @@ logger.addHandler(handler)
17
  logger.propagate = False
18
 
19
  @click.command()
20
- @click.argument("filename", type=click.Path(exists=True), default="data/challenge_data.csv")
 
 
 
 
21
  @click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
22
  default="INFO")
23
  @click.option("--use-api-key", is_flag=True, default=False)
24
  @click.option("--force-cpu", is_flag=True, default=False)
25
  @click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
26
  @click.option("--batch-size", default=4)
27
- def main(filename, log_level, use_api_key, force_cpu, only_cached, batch_size):
28
- result_backup = "data/challenge_scores.pickle"
29
- result_filename = "data/challenge_scores_final.csv"
30
  id_string = "OpenAlexID (as URL)"
31
  paper_id = "PaperProjectID"
32
  title_string = "Title"
 
17
  logger.propagate = False
18
 
19
  @click.command()
20
+ @click.argument("filename", type=click.Path(exists=True),
21
+ default="data/Metadata file COMBINED.csv")
22
+ @click.option("--output", type=click.Path(),
23
+ default="data/challenge_scores.pickle",
24
+ help="Output pickle file to save the raw partial scores.")
25
  @click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
26
  default="INFO")
27
  @click.option("--use-api-key", is_flag=True, default=False)
28
  @click.option("--force-cpu", is_flag=True, default=False)
29
  @click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
30
  @click.option("--batch-size", default=4)
31
+ def main(filename, output, log_level, use_api_key, force_cpu, only_cached, batch_size):
32
+ result_backup = output
 
33
  id_string = "OpenAlexID (as URL)"
34
  paper_id = "PaperProjectID"
35
  title_string = "Title"
utils/score.py CHANGED
@@ -29,6 +29,9 @@ class Paper():
29
 
30
  self.status = "OK"
31
  self.titles_only = False
 
 
 
32
 
33
 
34
  class Score():
@@ -207,6 +210,7 @@ class Evaluator():
207
  """
208
  if self.online and paper.openalexid in self.ref_data_cache:
209
  # this mean it is challenge paper and we need reproduciblity
 
210
  paper.title = self.titles_cache.get(paper.openalexid, None)
211
  paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
212
  paper.references = self.ref_data_cache.get(paper.openalexid, [])
@@ -262,6 +266,7 @@ class Evaluator():
262
  return paper
263
 
264
  def fetch_ref_data_batched(self, paper:Paper):
 
265
  if not self.api_key:
266
  yield from self.fetch_ref_data(paper)
267
  return
@@ -276,31 +281,34 @@ class Evaluator():
276
  paper.ref_data.append((title, abstract))
277
  else:
278
  to_process.append(ref)
279
- # now process in batches
280
- for i in range(0, len(to_process), batch_size):
281
- works = to_process[i:i+batch_size]
282
- works = [eat_prefix(w) for w in works]
283
-
284
- url = "https://api.openalex.org/works"
285
- params = {
286
- "api_key": self.api_key,
287
- "filter": "openalex:" + "|".join(works),
288
- "select": "id,title,abstract_inverted_index"
289
- }
290
- data = send_request(url, params, 10, only_cached=self.only_cached)
291
- if data is None:
292
- raise ValueError("Error during batched fetching of reference data.")
293
- for item in data["results"]:
294
- openalexid = eat_prefix(item["id"])
295
- title = item.get("title", None)
296
- abstract = item.get("abstract_inverted_index", None)
297
- abstract = create_abstract(abstract)
298
- if title is None:
299
- logger.warning(f"Title not found for reference {openalexid}. Skipping this reference.")
300
- continue
301
- self.titles_cache[openalexid] = title
302
- self.abstracts_cache[openalexid] = abstract
303
- paper.ref_data.append((title, abstract))
 
 
 
304
 
305
  yield self.check_ref_data(paper)
306
  return
@@ -339,16 +347,23 @@ class Evaluator():
339
  if ref in self.titles_cache:
340
  title = self.titles_cache[ref]
341
  else:
 
342
  select_fields.append("title")
343
  # abstract
344
  if not paper.titles_only:
345
  if ref in self.abstracts_cache:
346
  abstract = self.abstracts_cache[ref]
347
  else:
 
348
  select_fields.append("abstract_inverted_index")
349
  else:
350
  abstract = None
351
 
 
 
 
 
 
352
  if not select_fields and title is not None:
353
  paper.ref_data.append((title, abstract))
354
  continue
 
29
 
30
  self.status = "OK"
31
  self.titles_only = False
32
+ # if challenge paper, we want to have reproducible results,
33
+ # so we use only cached data
34
+ self.challenge_paper = False
35
 
36
 
37
  class Score():
 
210
  """
211
  if self.online and paper.openalexid in self.ref_data_cache:
212
  # this mean it is challenge paper and we need reproduciblity
213
+ paper.challenge_paper = True
214
  paper.title = self.titles_cache.get(paper.openalexid, None)
215
  paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
216
  paper.references = self.ref_data_cache.get(paper.openalexid, [])
 
266
  return paper
267
 
268
  def fetch_ref_data_batched(self, paper:Paper):
269
+
270
  if not self.api_key:
271
  yield from self.fetch_ref_data(paper)
272
  return
 
281
  paper.ref_data.append((title, abstract))
282
  else:
283
  to_process.append(ref)
284
+
285
+ if not (self.online and paper.challenge_paper):
286
+
287
+ # now process in batches
288
+ for i in range(0, len(to_process), batch_size):
289
+ works = to_process[i:i+batch_size]
290
+ works = [eat_prefix(w) for w in works]
291
+
292
+ url = "https://api.openalex.org/works"
293
+ params = {
294
+ "api_key": self.api_key,
295
+ "filter": "openalex:" + "|".join(works),
296
+ "select": "id,title,abstract_inverted_index"
297
+ }
298
+ data = send_request(url, params, 10, only_cached=self.only_cached)
299
+ if data is None:
300
+ raise ValueError("Error during batched fetching of reference data.")
301
+ for item in data["results"]:
302
+ openalexid = eat_prefix(item["id"])
303
+ title = item.get("title", None)
304
+ abstract = item.get("abstract_inverted_index", None)
305
+ abstract = create_abstract(abstract)
306
+ if title is None:
307
+ logger.warning(f"Title not found for reference {openalexid}. Skipping this reference.")
308
+ continue
309
+ self.titles_cache[openalexid] = title
310
+ self.abstracts_cache[openalexid] = abstract
311
+ paper.ref_data.append((title, abstract))
312
 
313
  yield self.check_ref_data(paper)
314
  return
 
347
  if ref in self.titles_cache:
348
  title = self.titles_cache[ref]
349
  else:
350
+ title = None
351
  select_fields.append("title")
352
  # abstract
353
  if not paper.titles_only:
354
  if ref in self.abstracts_cache:
355
  abstract = self.abstracts_cache[ref]
356
  else:
357
+ abstract = None
358
  select_fields.append("abstract_inverted_index")
359
  else:
360
  abstract = None
361
 
362
+ if self.online and paper.challenge_paper:
363
+ if title is not None:
364
+ paper.ref_data.append((title, abstract))
365
+ continue
366
+
367
  if not select_fields and title is not None:
368
  paper.ref_data.append((title, abstract))
369
  continue