Commit c99a049 · consolidate
Ahmed Ahmed committed · Parent: 25de5ef

Files changed:
- src/display/utils.py (+2 -2)
- src/leaderboard/read_evals.py (+21 -17)
src/display/utils.py CHANGED

@@ -28,8 +28,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    #
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(
+    # Use exact column name from Tasks
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
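For context on what these append() calls feed into, here is a minimal sketch of the ColumnContent / AutoEvalColumn machinery this file is assumed to follow; the frozen dataclass and the make_dataclass call mirror the common leaderboard-template pattern and are not shown in this commit, only the append lines above are confirmed.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                        # header string shown in the dataframe
    type: str                        # "str", "number", "markdown", ...
    displayed_by_default: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True)])
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
# ... per-task and model-information columns appended as in the diff above ...

# The [field_name, type, default] triples become a dataclass, so other modules can
# refer to columns as AutoEvalColumn.<field>.name (e.g. AutoEvalColumn.average.name).
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)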
src/leaderboard/read_evals.py CHANGED

@@ -78,9 +78,11 @@ class EvalResult:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         # Calculate average, handling perplexity (lower is better)
         scores = []
+        perplexity_score = None
         for task in Tasks:
             if task.value.benchmark in self.results:
                 score = self.results[task.value.benchmark]
+                perplexity_score = score  # Save the raw score
                 # Convert perplexity to a 0-100 scale where lower perplexity = higher score
                 # Using a log scale since perplexity can vary widely
                 # Cap at 100 for very low perplexity and 0 for very high perplexity
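The conversion the three trailing comments describe sits outside the visible hunk. A sketch of one way to implement exactly that behaviour (illustrative only; the function name and the bounds are assumptions, not code from this Space):

import math

def perplexity_to_score(ppl: float, low: float = 1.0, high: float = 1000.0) -> float:
    """Map perplexity onto 0-100 where lower perplexity gives a higher score.

    Log scale, clamped: anything at or below `low` scores 100, anything at or
    above `high` scores 0. The bounds and the function name are illustrative
    assumptions, not values taken from this repository.
    """
    log_low, log_high = math.log(low), math.log(high)
    frac = (math.log(max(ppl, 1e-9)) - log_low) / (log_high - log_low)
    return 100.0 * (1.0 - min(max(frac, 0.0), 1.0))

print(perplexity_to_score(1.0))     # 100.0
print(perplexity_to_score(30.0))    # ~50.8
print(perplexity_to_score(2000.0))  # 0.0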
@@ -106,14 +108,11 @@ class EvalResult:
             AutoEvalColumn.likes.name: 0, # Default likes
         }
 
-
-
-
-
-
-            data_dict[f"{task.value.col_name} ⬇️"] = score
-        else:
-            data_dict[f"{task.value.col_name} ⬇️"] = None
+        # Add perplexity score with the exact column name from Tasks
+        if perplexity_score is not None:
+            data_dict[Tasks.task0.value.col_name] = perplexity_score
+        else:
+            data_dict[Tasks.task0.value.col_name] = None
 
         return data_dict
 
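This hunk is the consolidation point: data_dict is now keyed by Tasks.task0.value.col_name, the same string src/display/utils.py uses for the column header, instead of the old hard-coded f"{task.value.col_name} ⬇️" key. A minimal sketch of the assumed Task / Tasks definitions (field names follow the standard leaderboard template; the benchmark, metric, and column strings are placeholders):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key looked up in self.results (the eval JSON)
    metric: str      # metric name inside that entry
    col_name: str    # column header shown on the leaderboard

class Tasks(Enum):
    # Placeholder strings: the real benchmark/metric/column names live in the Space's config.
    task0 = Task("perplexity", "perplexity", "Perplexity ⬇️")

# utils.py builds the column as ColumnContent(task.value.col_name, "number", True) and
# read_evals.py writes data_dict[Tasks.task0.value.col_name], so both sides now share
# the exact same key and the score actually lands in the displayed column.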
@@ -131,22 +130,27 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results.keys():
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+        except Exception as e:
+            print(f"Error processing result file {model_result_filepath}: {e}")
+            continue
 
     results = []
     for v in eval_results.values():
         try:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
-        except KeyError: # not all eval values present
+        except KeyError as e: # not all eval values present
+            print(f"Error converting result to dict: {e}")
             continue
 
     return results
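With the per-file try/except, one malformed result JSON is logged and skipped instead of aborting the whole scan. A hedged sketch of how the function's output is typically consumed by the app; the pandas usage and the results path are assumptions about surrounding code that is not part of this commit:

import pandas as pd

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results

# "eval-results" is an illustrative path; the Space's actual results directory may differ.
raw_data = get_raw_eval_results("eval-results")
rows = [v.to_dict() for v in raw_data]  # entries that failed to_dict() were already dropped above
df = pd.DataFrame(rows)
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)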