Commit · 3a84b3b
1 Parent(s): 5f70754
chore: Small update

app.py CHANGED
@@ -45,12 +45,12 @@ the models 10 times with bootstrapped test sets and different few-shot examples
 iteration. This allows us to better measure the uncertainty of the results. We use the
 uncertainty in the radial plot when we compute the rank scores for the models. Namely,
 we compute the rank score by firstly computing the rank of the model on each task,
-where two models are considered to have the same rank if
-
-
-
-
-
+where two models are considered to have the same rank if there is not a statistically
+significant difference between their scores (one-tailed t-test with p < 0.05). We next
+apply a logarithmic transformation to the ranks, to downplay the importance of the
+poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+range [0, 1], resulting in the best performing models having rank scores close to 1 and
+the worst performing models having rank scores close to 0.
 
 ## The Benchmark Datasets
 
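The added paragraph spells out the whole rank-score recipe: significance-aware ranks, a logarithmic transformation, then inversion and normalisation. Below is a minimal sketch of that recipe for a single task and language; it is not the actual code in app.py, the compute_rank_scores helper and its bootstrap_scores input are hypothetical, and the one-tailed t-test is done here with scipy's ttest_ind.

# Hypothetical illustration of the rank-score recipe described above,
# not the implementation in app.py.
import numpy as np
from scipy import stats

def compute_rank_scores(bootstrap_scores: dict[str, list[float]]) -> dict[str, float]:
    """Map each model to a rank score in [0, 1], higher being better."""
    # Sort the models by their mean bootstrapped score, best first
    model_ids = sorted(bootstrap_scores, key=lambda m: -np.mean(bootstrap_scores[m]))

    # A model shares the rank of the previous (better) model when a one-tailed
    # t-test cannot separate the two at p < 0.05
    ranks = [1]
    for prev_id, model_id in zip(model_ids[:-1], model_ids[1:]):
        p_value = stats.ttest_ind(
            bootstrap_scores[prev_id],
            bootstrap_scores[model_id],
            alternative="greater",
        ).pvalue
        ranks.append(ranks[-1] if p_value >= 0.05 else ranks[-1] + 1)

    # Log-transform the ranks to downplay poorly performing models, then invert
    # and normalise to [0, 1] so the best models end up close to 1
    log_ranks = np.log(ranks)
    max_log_rank = log_ranks.max()
    scores = 1 - log_ranks / max_log_rank if max_log_rank > 0 else np.ones_like(log_ranks)
    return dict(zip(model_ids, scores))

Note that if no model can be statistically separated from the others, every model gets rank 1 and hence a rank score of 1.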
@@ -551,9 +551,9 @@ def produce_radial_plot(
 ranks.append(rank)
 
 log_ranks = np.log(ranks)
-scores = log_ranks / log_ranks.max()
+scores = 1 - (log_ranks / log_ranks.max())
 for model_id, score in zip(model_ids_sorted, scores):
-    all_rank_scores[task][language][model_id] =
+    all_rank_scores[task][language][model_id] = score
 logger.info("Successfully computed rank scores.")
 
 # Add all the evaluation results for each model
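For example, with ranks 1, 2 and 4, np.log gives roughly 0.0, 0.69 and 1.39; the old expression mapped these to rank scores 0.0, 0.5 and 1.0 (best model lowest), while the new expression gives 1.0, 0.5 and 0.0, so the best-ranked model now receives the highest rank score, as described in the text above.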
@@ -568,15 +568,13 @@ def produce_radial_plot(
 if model_id not in results_dfs_filtered[language].index:
     continue
 
-score_list = results_dfs_filtered[language].loc[model_id][task]
-
 rank_score = 100 * all_rank_scores[task][language][model_id]
 rank_scores.append(rank_score)
 
-
-
-
-scores.append(
+score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
+if score_arr.mean() < 1:
+    score_arr *= 100
+scores.append(score_arr.mean())
 if use_rank_score:
     result_list.append(np.mean(rank_scores))
 else:
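The replacement block also appears to harmonise metric scales before averaging: when the bootstrapped scores for a task are fractions (mean below 1), they are multiplied by 100 so that every task contributes on a 0-100 scale. A rough illustration with invented values:

import numpy as np

score_arr = np.array([0.71, 0.69, 0.74])  # fraction-scale metric such as accuracy
if score_arr.mean() < 1:
    score_arr *= 100  # rescale so it averages with metrics already on a 0-100 scale
print(score_arr.mean())  # 71.33...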