Commit: ead2260
Parent(s): f12b6ec
fix: Correct evaluation result mapping and display
- Update evaluation functions to return results with dataset names as keys
- Modify read_evals.py to map metric values correctly to dataset names
- Improve leaderboard display by:
  - Increasing decimal precision to 4 places
  - Re-enabling NaN value filtering
  - Maintaining proper sorting by average score
- src/evaluator/evaluate.py +4 -4
- src/leaderboard/read_evals.py +18 -8
- src/populate.py +2 -4
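As the commit message describes, each evaluation function now returns its score in a dict keyed by the dataset it was measured on, which is what the leaderboard code later matches against task benchmarks. A minimal illustration of that shape (the dataset names come from the diffs below; the numbers are placeholders, not real results):

# Illustrative only: dataset-keyed result dicts as returned by the updated evaluators.
tsac_results = {"fbougares/tsac": 0.8123}                      # sentiment accuracy (made-up value)
tunisian_results = {"arbml/Tunisian_Dialect_Corpus": 0.6741}   # corpus coverage (made-up value)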
src/evaluator/evaluate.py
CHANGED
@@ -149,7 +149,7 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
         print(f"Total predictions: {total}")
         print(f"Accuracy: {accuracy:.4f}")
 
-        return {"
+        return {"fbougares/tsac": accuracy}
     except Exception as e:
         print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -187,7 +187,7 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
 
         coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
         print(f"Tunisian Corpus Coverage: {coverage:.2%}")
-        return {"
+        return {"arbml/Tunisian_Dialect_Corpus": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -289,8 +289,8 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
             precision=precision,
             weight_type=weight_type,
             results={
-
-
+                Tasks.tsac_sentiment.value.metric: tsac_results.get(Tasks.tsac_sentiment.value.metric),
+                Tasks.tunisian_corpus.value.metric: tunisian_results.get(Tasks.tunisian_corpus.value.metric)
             }
         )
     except Exception as e:
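The hunks above rely on a Tasks enum whose members carry a benchmark id, a metric name, and a leaderboard column name (task.value.benchmark, task.value.metric, task.value.col_name). The enum is defined elsewhere in the Space; the following is only a hypothetical sketch of that structure so the attribute accesses in the diff are easier to follow, and it also shows why the dict.get() lookups are used: a missing or failed result becomes None instead of raising a KeyError.

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class TaskConfig:  # hypothetical stand-in; field names mirror the attributes used in the diff
    benchmark: str
    metric: str
    col_name: str

class Tasks(Enum):  # hypothetical values; the real definitions live in the Space's source
    tsac_sentiment = TaskConfig("fbougares/tsac", "accuracy", "TSAC Sentiment")
    tunisian_corpus = TaskConfig("arbml/Tunisian_Dialect_Corpus", "coverage", "Tunisian Corpus Coverage")

# dict.get never raises, so an evaluator that produced nothing simply yields a None entry.
tsac_results = {}  # e.g. the TSAC evaluation failed and returned no scores
entry = tsac_results.get(Tasks.tsac_sentiment.value.metric)  # -> None, not a KeyError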
src/leaderboard/read_evals.py
CHANGED
@@ -154,9 +154,17 @@ class EvalResult:
             AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }
 
+        # Map dataset names to their metric values
+        tsac_result = self.results.get("fbougares/tsac")
+        tunisian_result = self.results.get("arbml/Tunisian_Dialect_Corpus")
+
+        # Map metric values to their corresponding dataset names
         for task in Tasks:
-
-
+            if task.value.benchmark == "fbougares/tsac":
+                data_dict[task.value.col_name] = self.results.get("accuracy")
+            elif task.value.benchmark == "arbml/Tunisian_Dialect_Corpus":
+                data_dict[task.value.col_name] = self.results.get("coverage")
+        print("data_dict : ", data_dict)
         return data_dict
 
 
@@ -217,24 +225,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
+        print("eval_name : ", eval_name)
         if eval_name in eval_results.keys():
             # If we already have results for this eval, append to list
             eval_results[eval_name].append(eval_result)
         else:
             # Initialize list for this eval name
             eval_results[eval_name] = [eval_result]
-
+    print("eval_results : ", eval_results)
     # Process final results
     final_results = {}
     for eval_name, eval_list in eval_results.items():
         # Create merged results from all evaluations, ensuring all required task keys are present
-        merged_results = {task.value.
+        merged_results = {task.value.metric: None for task in Tasks}
         for eval_result in eval_list:
             merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
         # Take the first eval_result as base and update with merged results
-        print("evaluation list : ", eval_list)
+        print("evaluation list : ", len(eval_list))
         base_result = eval_list[0]
+        print("base_result : ", base_result)
         # print(base_result)
         final_results[eval_name] = EvalResult(
             eval_name=eval_name,
@@ -249,12 +259,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             date=base_result.date,
             still_on_hub=base_result.still_on_hub
         )
-    print(final_results)
+    print(len(final_results))
+    print(final_results.keys())
+    print(final_results.values())
 
     results = []
     for v in final_results.values():
-        print("v : ",v)
-        print("Merged results: ", v.results)
         try:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
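The merging step in get_raw_eval_results follows a seed-then-update pattern: every expected metric key is pre-filled with None, then each partial result is folded in while skipping None values, so one incomplete evaluation cannot erase a score that another run already provided. A standalone sketch of that pattern with illustrative metric names and numbers:

# Seed all expected keys with None, then merge partial results, ignoring None values.
expected_metrics = ["accuracy", "coverage"]          # illustrative; the real keys come from Tasks
merged_results = {m: None for m in expected_metrics}

partial_runs = [
    {"accuracy": 0.81, "coverage": None},            # first eval: coverage missing
    {"coverage": 0.67},                              # second eval: fills in coverage
]
for partial in partial_runs:
    merged_results.update({k: v for k, v in partial.items() if v is not None})

assert merged_results == {"accuracy": 0.81, "coverage": 0.67}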
src/populate.py
CHANGED
@@ -20,11 +20,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)
     df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
-    print(df)
-    df = df[cols].round(decimals=2)
-    print(df)
-    # df = df[has_no_nan_values(df, benchmark_cols)]
     # print(df)
+    df = df[cols].round(decimals=4)
+    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
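The populate.py change sorts by the average column, rounds the displayed columns to four decimals, and re-enables the row filter that drops models with missing benchmark scores. has_no_nan_values is the Space's own helper; the version below is only an assumed equivalent (keep every row whose benchmark columns are all non-null), and the column names and numbers are illustrative:

import pandas as pd

def has_no_nan_values(df: pd.DataFrame, benchmark_cols: list) -> pd.Series:
    # Assumed behaviour: boolean mask of rows with no NaN in any benchmark column.
    return df[benchmark_cols].notna().all(axis=1)

benchmark_cols = ["TSAC Sentiment", "Tunisian Corpus Coverage"]   # illustrative column names
df = pd.DataFrame({
    "Average": [0.74125, 0.40617],
    "TSAC Sentiment": [0.81234, 0.81234],
    "Tunisian Corpus Coverage": [0.67016, None],                  # second model has a missing score
})

df = df.sort_values(by=["Average"], ascending=False)
df = df[["Average"] + benchmark_cols].round(decimals=4)
df = df[has_no_nan_values(df, benchmark_cols)]                    # only the complete row survives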