Thang Pham committed
Commit 2f1d640 · Parent(s): 3410bfc

Update task names and leaderboard display

Files changed (5)
  1. app.py +33 -8
  2. src/about.py +32 -2
  3. src/display/utils.py +49 -11
  4. src/leaderboard/read_evals.py +31 -20
  5. src/populate.py +13 -4
app.py CHANGED
@@ -22,9 +22,17 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
+)
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -32,24 +40,35 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 
-
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF["T"] = range(1, len(LEADERBOARD_DF) + 1)
 
 (
     finished_eval_queue_df,
@@ -57,6 +76,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -81,7 +101,10 @@ def init_leaderboard(dataframe):
                 label="Select the number of parameters (B)",
             ),
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+                AutoEvalColumn.still_on_hub.name,
+                type="boolean",
+                label="Deleted/incomplete",
+                default=False,
             ),
         ],
         bool_checkboxgroup_label="Hide models",
@@ -171,7 +194,9 @@ with demo:
                        value="Original",
                        interactive=True,
                    )
-                   base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                   base_model_name_textbox = gr.Textbox(
+                       label="Base model (for delta or adapter weights)"
+                   )
 
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
@@ -201,4 +226,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
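The new LEADERBOARD_DF["T"] assignment fills the leaderboard's "T" column with a running 1..N index over the already-sorted frame. A minimal pandas sketch of the same idiom, using toy column names rather than the real leaderboard schema:

import pandas as pd

# Stand-in for LEADERBOARD_DF (illustrative columns only).
df = pd.DataFrame({"Model": ["model-a", "model-b", "model-c"], "Average ⬆️": [91.2, 88.5, 77.0]})
df = df.sort_values(by="Average ⬆️", ascending=False)
df["T"] = range(1, len(df) + 1)  # 1..N over the sorted rows, as in app.py above
print(df)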
src/about.py CHANGED
@@ -15,6 +15,19 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("exp1", "accuracy", "name2smi")
     task1 = Task("exp2", "accuracy", "name2coord")
+    task2 = Task("exp3", "accuracy", "name2opt")
+    task3 = Task("exp4", "accuracy", "name2vib")
+    task4 = Task("exp5", "accuracy", "name2gibbs")
+    task5 = Task("exp6", "accuracy", "name2file")
+    task6 = Task("exp7", "accuracy", "smi2coord")
+    task7 = Task("exp8", "accuracy", "smi2opt")
+    task8 = Task("exp9", "accuracy", "smi2vib")
+    task9 = Task("exp10", "accuracy", "smi2gibbs")
+    task10 = Task("exp11", "accuracy", "smi2file")
+    task11 = Task("exp12", "accuracy", "react2enthalpy")
+    task12 = Task("exp13", "accuracy", "react2gibbs")
+    task13 = Task("exp14", "accuracy", "react2enthalpy_multiagent")
+    task14 = Task("exp15", "accuracy", "react2gibbs_multiagent")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
@@ -22,11 +35,21 @@ NUM_FEWSHOT = 0  # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">ChemGraph Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+ChemGraph Leaderboard provides a reproducible evaluation of **agentic AI frameworks and large language models (LLMs)** for computational chemistry and materials science.
+
+This leaderboard benchmarks models on a diverse set of tasks, including:
+- Molecular geometry optimization, vibration analysis, and thermochemistry estimation.
+- Reaction thermodynamics prediction (enthalpy, Gibbs free energy).
+- Tool-usage accuracy in multi-agent workflows.
+
+Each model’s score reflects its ability to **follow structured tool protocols, generate physically meaningful results, and reason across chemistry-specific contexts**.
+The benchmark results are generated offline and uploaded as part of the [**ChemGraph paper**](https://arxiv.org/abs/2506.06363).
+
+Use this leaderboard to explore how different models and agents perform across core chemistry tasks, from small-molecule modeling to multi-step reaction workflows.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -70,4 +93,11 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@article{pham2025chemgraph,
+  title={ChemGraph: An Agentic Framework for Computational Chemistry Workflows},
+  author={Pham, Thang D and Tanikanti, Aditya and Keçeli, Murat},
+  journal={arXiv preprint arXiv:2506.06363},
+  year={2025},
+  url={https://arxiv.org/abs/2506.06363}
+}
 """
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,63 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+auto_eval_column_dict.append([
+    "model_type_symbol",
+    ColumnContent,
+    ColumnContent("T", "str", True, never_hidden=True),
+])
+auto_eval_column_dict.append([
+    "model",
+    ColumnContent,
+    ColumnContent("Model", "markdown", True, never_hidden=True),
+])
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([
+        task.name,
+        ColumnContent,
+        ColumnContent(task.value.col_name, "number", True),
+    ])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append([
+    "architecture",
+    ColumnContent,
+    ColumnContent("Architecture", "str", False),
+])
+auto_eval_column_dict.append([
+    "weight_type",
+    ColumnContent,
+    ColumnContent("Weight type", "str", False, True),
+])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append([
+    "params",
+    ColumnContent,
+    ColumnContent("#Params (B)", "number", False),
+])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append([
+    "still_on_hub",
+    ColumnContent,
+    ColumnContent("Available on the hub", "bool", False),
+])
+auto_eval_column_dict.append([
+    "revision",
+    ColumnContent,
+    ColumnContent("Model sha", "str", False, False),
+])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -53,12 +88,13 @@ class EvalQueueColumn:  # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +119,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +138,7 @@ class Precision(Enum):
            return Precision.bfloat16
        return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +146,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/leaderboard/read_evals.py CHANGED
@@ -14,22 +14,22 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -58,7 +58,10 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model,
+            config.get("model_sha", "main"),
+            trust_remote_code=True,
+            test_tokenizer=False,
         )
         architecture = "?"
         if model_config is not None:
@@ -72,7 +75,9 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([
+                v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k
+            ])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -85,15 +90,17 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+        request_file = get_request_file_for_model(
+            requests_path, self.full_model, self.precision.value.name
+        )
 
         try:
             with open(request_file, "r") as f:
@@ -105,7 +112,9 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -171,8 +180,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-
     eval_results = {}
+    print(f"MODEL FILE PATHS: {model_result_filepaths}")
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
@@ -181,14 +190,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            eval_results[eval_name].results.update({
+                k: v for k, v in eval_result.results.items() if v is not None
+            })
         else:
             eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
src/populate.py CHANGED
@@ -8,11 +8,12 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str, requests_path: str, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -39,7 +40,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            sub_entries = [
+                e
+                for e in os.listdir(f"{save_path}/{entry}")
+                if os.path.isfile(e) and not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
@@ -51,7 +56,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    finished_list = [
+        e
+        for e in all_evals
+        if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
+    ]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)