Thang Pham committed · Commit 2f1d640 · Parent(s): 3410bfc

Update task names and leaderboard display

Files changed:
- app.py +33 -8
- src/about.py +32 -2
- src/display/utils.py +49 -11
- src/leaderboard/read_evals.py +31 -20
- src/populate.py +13 -4
app.py
CHANGED
@@ -22,9 +22,17 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
+)
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -32,24 +40,35 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO,
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
    )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO,
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
    )
 except Exception:
     restart_space()
 
-
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF["T"] = range(1, len(LEADERBOARD_DF) + 1)
 
 (
     finished_eval_queue_df,
@@ -57,6 +76,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -81,7 +101,10 @@ def init_leaderboard(dataframe):
                 label="Select the number of parameters (B)",
             ),
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name,
+                AutoEvalColumn.still_on_hub.name,
+                type="boolean",
+                label="Deleted/incomplete",
+                default=False,
             ),
         ],
         bool_checkboxgroup_label="Hide models",
@@ -171,7 +194,9 @@ with demo:
                     value="Original",
                     interactive=True,
                 )
-                base_model_name_textbox = gr.Textbox(
+                base_model_name_textbox = gr.Textbox(
+                    label="Base model (for delta or adapter weights)"
+                )
 
         submit_button = gr.Button("Submit Eval")
         submission_result = gr.Markdown()
@@ -201,4 +226,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
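Note on the new ranking column: get_leaderboard_df returns the results already sorted by the average score, so the added LEADERBOARD_DF["T"] line simply numbers the rows and reuses the "T" column declared in src/display/utils.py as a rank. A minimal sketch of the pattern with made-up scores (not the real leaderboard data):

    import pandas as pd

    # Toy stand-in for LEADERBOARD_DF; the real frame comes from get_leaderboard_df().
    df = pd.DataFrame({"Model": ["m1", "m2", "m3"], "Average ⬆️": [71.2, 88.5, 79.9]})
    df = df.sort_values(by="Average ⬆️", ascending=False)  # get_leaderboard_df already sorts this way
    df["T"] = range(1, len(df) + 1)  # rank 1..N, displayed as the leaderboard's "T" column
    print(df.to_string(index=False))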
src/about.py
CHANGED
@@ -15,6 +15,19 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("exp1", "accuracy", "name2smi")
     task1 = Task("exp2", "accuracy", "name2coord")
+    task2 = Task("exp3", "accuracy", "name2opt")
+    task3 = Task("exp4", "accuracy", "name2vib")
+    task4 = Task("exp5", "accuracy", "name2gibbs")
+    task5 = Task("exp6", "accuracy", "name2file")
+    task6 = Task("exp7", "accuracy", "smi2coord")
+    task7 = Task("exp8", "accuracy", "smi2opt")
+    task8 = Task("exp9", "accuracy", "smi2vib")
+    task9 = Task("exp10", "accuracy", "smi2gibbs")
+    task10 = Task("exp11", "accuracy", "smi2file")
+    task11 = Task("exp12", "accuracy", "react2enthalpy")
+    task12 = Task("exp13", "accuracy", "react2gibbs")
+    task13 = Task("exp14", "accuracy", "react2enthalpy_multiagent")
+    task14 = Task("exp15", "accuracy", "react2gibbs_multiagent")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
@@ -22,11 +35,21 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">ChemGraph Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+ChemGraph Leaderboard provides a reproducible evaluation of **agentic AI frameworks and large language models (LLMs)** for computational chemistry and materials science.
+
+This leaderboard benchmarks models on a diverse set of tasks, including:
+- Molecular geometry optimization, vibration analysis, and thermochemistry estimation.
+- Reaction thermodynamics prediction (enthalpy, Gibbs free energy).
+- Tool-usage accuracy in multi-agent workflows.
+
+Each model’s score reflects its ability to **follow structured tool protocols, generate physically meaningful results, and reason across chemistry-specific contexts**.
+The benchmark results are generated offline and uploaded as part of the [**ChemGraph paper**](https://arxiv.org/abs/2506.06363).
+
+Use this leaderboard to explore how different models and agents perform across core chemistry tasks, from small-molecule modeling to multi-step reaction workflows.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -70,4 +93,11 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@article{pham2025chemgraph,
+  title={ChemGraph: An Agentic Framework for Computational Chemistry Workflows},
+  author={Pham, Thang D and Tanikanti, Aditya and Keçeli, Murat},
+  journal={arXiv preprint arXiv:2506.06363},
+  year={2025}
+  url={https://arxiv.org/abs/2506.06363}
+}
 """
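Note on the Task entries: judging from src/leaderboard/read_evals.py below, each Task(benchmark, metric, col_name) names the key that is looked up in a result file's "results" dict and the column title shown on the leaderboard. An illustrative sketch with a made-up result payload (not an actual ChemGraph result file):

    # Made-up result payload, keyed the way read_evals.py expects.
    example_result = {"results": {"exp1": {"accuracy": 0.87}}}

    benchmark, metric, col_name = "exp1", "accuracy", "name2smi"  # mirrors Task("exp1", "accuracy", "name2smi")
    score = example_result["results"][benchmark][metric]
    print(f"{col_name}: {score}")  # -> name2smi: 0.87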
src/display/utils.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,63 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append([
-
-
+auto_eval_column_dict.append([
+    "model_type_symbol",
+    ColumnContent,
+    ColumnContent("T", "str", True, never_hidden=True),
+])
+auto_eval_column_dict.append([
+    "model",
+    ColumnContent,
+    ColumnContent("Model", "markdown", True, never_hidden=True),
+])
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([
+    auto_eval_column_dict.append([
+        task.name,
+        ColumnContent,
+        ColumnContent(task.value.col_name, "number", True),
+    ])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append([
-
+auto_eval_column_dict.append([
+    "architecture",
+    ColumnContent,
+    ColumnContent("Architecture", "str", False),
+])
+auto_eval_column_dict.append([
+    "weight_type",
+    ColumnContent,
+    ColumnContent("Weight type", "str", False, True),
+])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append([
+auto_eval_column_dict.append([
+    "params",
+    ColumnContent,
+    ColumnContent("#Params (B)", "number", False),
+])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append([
-
+auto_eval_column_dict.append([
+    "still_on_hub",
+    ColumnContent,
+    ColumnContent("Available on the hub", "bool", False),
+])
+auto_eval_column_dict.append([
+    "revision",
+    ColumnContent,
+    ColumnContent("Model sha", "str", False, False),
+])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -53,12 +88,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +119,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +138,7 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +146,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
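Note on the column registry: each auto_eval_column_dict entry is a (field_name, field_type, default) triple, which is the shape dataclasses.make_dataclass consumes; the default is a ColumnContent carrying the display metadata. A self-contained sketch of the mechanism (the full ColumnContent field list is assumed from the leaderboard template, since only hidden and never_hidden are visible in this diff):

    from dataclasses import dataclass, make_dataclass

    @dataclass
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False

    # Each entry is (field_name, field_type, default) - exactly what make_dataclass expects.
    cols = [
        ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ]
    DemoColumns = make_dataclass("DemoColumns", cols, frozen=True)

    # The defaults stay reachable as class attributes, which is how COLS and BENCHMARK_COLS are built.
    print(DemoColumns.model.name)    # -> Model
    print(DemoColumns.average.type)  # -> number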
src/leaderboard/read_evals.py
CHANGED
@@ -14,22 +14,22 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -58,7 +58,10 @@
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model,
+            full_model,
+            config.get("model_sha", "main"),
+            trust_remote_code=True,
+            test_tokenizer=False,
         )
         architecture = "?"
         if model_config is not None:
@@ -72,7 +75,9 @@
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([
+            accs = np.array([
+                v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k
+            ])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -85,15 +90,17 @@
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(
+        request_file = get_request_file_for_model(
+            requests_path, self.full_model, self.precision.value.name
+        )
 
         try:
             with open(request_file, "r") as f:
@@ -105,7 +112,9 @@
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -171,8 +180,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
         for file in files:
            model_result_filepaths.append(os.path.join(root, file))
-
     eval_results = {}
+    print(f"MODEL FILE PATHS: {model_result_filepaths}")
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
@@ -181,14 +190,16 @@
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({
+            eval_results[eval_name].results.update({
+                k: v for k, v in eval_result.results.items() if v is not None
+            })
         else:
             eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
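Note on the merge step: results for the same eval_name are combined so that a score from a later result file only overwrites an earlier one when it is not None. A minimal sketch of that dictionary-merge behaviour with dummy scores:

    # Dummy per-file score dicts; None stands for a benchmark missing from a result file.
    existing = {"exp1": 71.0, "exp2": None}
    incoming = {"exp2": 64.5, "exp3": None}

    # Same filter as get_raw_eval_results: drop None values so they never clobber real scores.
    existing.update({k: v for k, v in incoming.items() if v is not None})
    print(existing)  # -> {'exp1': 71.0, 'exp2': 64.5}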
src/populate.py
CHANGED
@@ -8,11 +8,12 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(
+def get_leaderboard_df(
+    results_path: str, requests_path: str, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -39,7 +40,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [
+            sub_entries = [
+                e
+                for e in os.listdir(f"{save_path}/{entry}")
+                if os.path.isfile(e) and not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
@@ -51,7 +56,11 @@
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [
+    finished_list = [
+        e
+        for e in all_evals
+        if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
+    ]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
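Note on usage: app.py calls these helpers at startup. A hypothetical local invocation mirroring the unpacking in app.py (the directory names below are placeholders; in the Space they come from src.envs as EVAL_RESULTS_PATH and EVAL_REQUESTS_PATH):

    from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
    from src.populate import get_evaluation_queue_df, get_leaderboard_df

    leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
    finished_df, running_df, pending_df = get_evaluation_queue_df("eval-queue", EVAL_COLS)
    print(leaderboard_df.head())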