Petra Vidnerova committed on
Commit
a0da684
·
1 Parent(s): f3b88b8

reproducibility fix

Browse files
Files changed (2) hide show
  1. calculate_scores.py +7 -4
  2. utils/score.py +40 -25
calculate_scores.py CHANGED
@@ -17,16 +17,19 @@ logger.addHandler(handler)
17
  logger.propagate = False
18
 
19
  @click.command()
20
- @click.argument("filename", type=click.Path(exists=True), default="data/challenge_data.csv")
 
 
 
 
21
  @click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
22
  default="INFO")
23
  @click.option("--use-api-key", is_flag=True, default=False)
24
  @click.option("--force-cpu", is_flag=True, default=False)
25
  @click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
26
  @click.option("--batch-size", default=4)
27
- def main(filename, log_level, use_api_key, force_cpu, only_cached, batch_size):
28
- result_backup = "data/challenge_scores.pickle"
29
- result_filename = "data/challenge_scores_final.csv"
30
  id_string = "OpenAlexID (as URL)"
31
  paper_id = "PaperProjectID"
32
  title_string = "Title"
 
17
  logger.propagate = False
18
 
19
  @click.command()
20
+ @click.argument("filename", type=click.Path(exists=True),
21
+ default="data/Metadata file COMBINED.csv")
22
+ @click.option("--output", type=click.Path(),
23
+ default="data/challenge_scores.pickle",
24
+ help="Output pickle file to save the raw partial scores.")
25
  @click.option("--log-level", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"]),
26
  default="INFO")
27
  @click.option("--use-api-key", is_flag=True, default=False)
28
  @click.option("--force-cpu", is_flag=True, default=False)
29
  @click.option("--only-cached", is_flag=True, default=False, help="Only evaluate papers that have cached data available, skip others.")
30
  @click.option("--batch-size", default=4)
31
+ def main(filename, output, log_level, use_api_key, force_cpu, only_cached, batch_size):
32
+ result_backup = output
 
33
  id_string = "OpenAlexID (as URL)"
34
  paper_id = "PaperProjectID"
35
  title_string = "Title"
utils/score.py CHANGED
@@ -29,6 +29,9 @@ class Paper():
29
 
30
  self.status = "OK"
31
  self.titles_only = False
 
 
 
32
 
33
 
34
  class Score():
@@ -207,6 +210,7 @@ class Evaluator():
207
  """
208
  if self.online and paper.openalexid in self.ref_data_cache:
209
  # this mean it is challenge paper and we need reproduciblity
 
210
  paper.title = self.titles_cache.get(paper.openalexid, None)
211
  paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
212
  paper.references = self.ref_data_cache.get(paper.openalexid, [])
@@ -262,6 +266,7 @@ class Evaluator():
262
  return paper
263
 
264
  def fetch_ref_data_batched(self, paper:Paper):
 
265
  if not self.api_key:
266
  yield from self.fetch_ref_data(paper)
267
  return
@@ -276,31 +281,34 @@ class Evaluator():
276
  paper.ref_data.append((title, abstract))
277
  else:
278
  to_process.append(ref)
279
- # now process in batches
280
- for i in range(0, len(to_process), batch_size):
281
- works = to_process[i:i+batch_size]
282
- works = [eat_prefix(w) for w in works]
283
-
284
- url = "https://api.openalex.org/works"
285
- params = {
286
- "api_key": self.api_key,
287
- "filter": "openalex:" + "|".join(works),
288
- "select": "id,title,abstract_inverted_index"
289
- }
290
- data = send_request(url, params, 10, only_cached=self.only_cached)
291
- if data is None:
292
- raise ValueError("Error during batched fetching of reference data.")
293
- for item in data["results"]:
294
- openalexid = eat_prefix(item["id"])
295
- title = item.get("title", None)
296
- abstract = item.get("abstract_inverted_index", None)
297
- abstract = create_abstract(abstract)
298
- if title is None:
299
- logger.warning(f"Title not found for reference {openalexid}. Skipping this reference.")
300
- continue
301
- self.titles_cache[openalexid] = title
302
- self.abstracts_cache[openalexid] = abstract
303
- paper.ref_data.append((title, abstract))
 
 
 
304
 
305
  yield self.check_ref_data(paper)
306
  return
@@ -339,16 +347,23 @@ class Evaluator():
339
  if ref in self.titles_cache:
340
  title = self.titles_cache[ref]
341
  else:
 
342
  select_fields.append("title")
343
  # abstract
344
  if not paper.titles_only:
345
  if ref in self.abstracts_cache:
346
  abstract = self.abstracts_cache[ref]
347
  else:
 
348
  select_fields.append("abstract_inverted_index")
349
  else:
350
  abstract = None
351
 
 
 
 
 
 
352
  if not select_fields and title is not None:
353
  paper.ref_data.append((title, abstract))
354
  continue
 
29
 
30
  self.status = "OK"
31
  self.titles_only = False
32
+ # if challenge paper, we want to have reproducible results,
33
+ # so we use only cached data
34
+ self.challenge_paper = False
35
 
36
 
37
  class Score():
 
210
  """
211
  if self.online and paper.openalexid in self.ref_data_cache:
212
  # this mean it is challenge paper and we need reproduciblity
213
+ paper.challenge_paper = True
214
  paper.title = self.titles_cache.get(paper.openalexid, None)
215
  paper.abstract = self.abstracts_cache.get(paper.openalexid, None)
216
  paper.references = self.ref_data_cache.get(paper.openalexid, [])
 
266
  return paper
267
 
268
  def fetch_ref_data_batched(self, paper:Paper):
269
+
270
  if not self.api_key:
271
  yield from self.fetch_ref_data(paper)
272
  return
 
281
  paper.ref_data.append((title, abstract))
282
  else:
283
  to_process.append(ref)
284
+
285
+ if not (self.online and paper.challenge_paper):
286
+
287
+ # now process in batches
288
+ for i in range(0, len(to_process), batch_size):
289
+ works = to_process[i:i+batch_size]
290
+ works = [eat_prefix(w) for w in works]
291
+
292
+ url = "https://api.openalex.org/works"
293
+ params = {
294
+ "api_key": self.api_key,
295
+ "filter": "openalex:" + "|".join(works),
296
+ "select": "id,title,abstract_inverted_index"
297
+ }
298
+ data = send_request(url, params, 10, only_cached=self.only_cached)
299
+ if data is None:
300
+ raise ValueError("Error during batched fetching of reference data.")
301
+ for item in data["results"]:
302
+ openalexid = eat_prefix(item["id"])
303
+ title = item.get("title", None)
304
+ abstract = item.get("abstract_inverted_index", None)
305
+ abstract = create_abstract(abstract)
306
+ if title is None:
307
+ logger.warning(f"Title not found for reference {openalexid}. Skipping this reference.")
308
+ continue
309
+ self.titles_cache[openalexid] = title
310
+ self.abstracts_cache[openalexid] = abstract
311
+ paper.ref_data.append((title, abstract))
312
 
313
  yield self.check_ref_data(paper)
314
  return
 
347
  if ref in self.titles_cache:
348
  title = self.titles_cache[ref]
349
  else:
350
+ title = None
351
  select_fields.append("title")
352
  # abstract
353
  if not paper.titles_only:
354
  if ref in self.abstracts_cache:
355
  abstract = self.abstracts_cache[ref]
356
  else:
357
+ abstract = None
358
  select_fields.append("abstract_inverted_index")
359
  else:
360
  abstract = None
361
 
362
+ if self.online and paper.challenge_paper:
363
+ if title is not None:
364
+ paper.ref_data.append((title, abstract))
365
+ continue
366
+
367
  if not select_fields and title is not None:
368
  paper.ref_data.append((title, abstract))
369
  continue