Commit
·
576340d
1
Parent(s):
ada1f6c
chore: Revert last change
Browse files
app.py
CHANGED
|
@@ -555,19 +555,18 @@ def produce_radial_plot(
|
|
| 555 |
for language in languages:
|
| 556 |
if model_id not in results_dfs_filtered[language].index:
|
| 557 |
continue
|
| 558 |
-
|
| 559 |
score_list = results_dfs_filtered[language].loc[model_id][task]
|
| 560 |
-
if all(score < 1 for score in score_list):
|
| 561 |
-
score_list = [100 * score for score in score_list]
|
| 562 |
-
|
| 563 |
win_ratio = 100 * np.mean([
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
)
|
| 567 |
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
| 568 |
])
|
| 569 |
win_ratios.append(win_ratio)
|
| 570 |
|
|
|
|
|
|
|
|
|
|
| 571 |
scores.append(np.mean(score_list))
|
| 572 |
if use_win_ratio:
|
| 573 |
result_list.append(np.mean(win_ratios))
|
|
@@ -688,48 +687,5 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 688 |
|
| 689 |
return results_dfs
|
| 690 |
|
| 691 |
-
|
| 692 |
-
def scores_statistically_better(
|
| 693 |
-
score_values_1: list[float], score_values_2: list[float]
|
| 694 |
-
) -> bool:
|
| 695 |
-
"""Determine whether the first score group is statistically better than the second.
|
| 696 |
-
|
| 697 |
-
Args:
|
| 698 |
-
score_values_1:
|
| 699 |
-
The scores for the first group.
|
| 700 |
-
score_values_2:
|
| 701 |
-
The scores for the second group.
|
| 702 |
-
|
| 703 |
-
Returns:
|
| 704 |
-
Whether the first score group is statistically better than the second.
|
| 705 |
-
"""
|
| 706 |
-
assert len(score_values_1) == len(score_values_2), (
|
| 707 |
-
"The two score groups must have the same length."
|
| 708 |
-
)
|
| 709 |
-
|
| 710 |
-
# Separate the scores into groups of 10, consisting of the scores for each
|
| 711 |
-
# dataset
|
| 712 |
-
group_scores_1 = [
|
| 713 |
-
score_values_1[idx:idx+10] for idx in range(0, len(score_values_1), 10)
|
| 714 |
-
]
|
| 715 |
-
group_scores_2 = [
|
| 716 |
-
score_values_2[idx:idx+10] for idx in range(0, len(score_values_2), 10)
|
| 717 |
-
]
|
| 718 |
-
|
| 719 |
-
# Compute t-statistics for each group separately, and compute the mean
|
| 720 |
-
# t-statistic
|
| 721 |
-
t_statistics = [
|
| 722 |
-
stats.ttest_ind(a=group_1, b=group_2, alternative="greater").statistic
|
| 723 |
-
for group_1, group_2 in zip(group_scores_1, group_scores_2)
|
| 724 |
-
]
|
| 725 |
-
mean_t_statistic = np.mean(t_statistics)
|
| 726 |
-
|
| 727 |
-
# Compute the p-value for the mean t-statistic, where the null hypothesis is
|
| 728 |
-
# that the first group does not have a larger mean score than the second group
|
| 729 |
-
degrees_of_freedom = len(score_values_1) - 1
|
| 730 |
-
p_value = 1 - stats.t.cdf(abs(mean_t_statistic), degrees_of_freedom)
|
| 731 |
-
|
| 732 |
-
return p_value < 0.05
|
| 733 |
-
|
| 734 |
if __name__ == "__main__":
|
| 735 |
main()
|
|
|
|
| 555 |
for language in languages:
|
| 556 |
if model_id not in results_dfs_filtered[language].index:
|
| 557 |
continue
|
|
|
|
| 558 |
score_list = results_dfs_filtered[language].loc[model_id][task]
|
|
|
|
|
|
|
|
|
|
| 559 |
win_ratio = 100 * np.mean([
|
| 560 |
+
stats.ttest_rel(
|
| 561 |
+
a=score_list, b=other_scores, alternative="greater"
|
| 562 |
+
).pvalue < 0.05
|
| 563 |
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
| 564 |
])
|
| 565 |
win_ratios.append(win_ratio)
|
| 566 |
|
| 567 |
+
if all(score < 1 for score in score_list):
|
| 568 |
+
score_list = [100 * score for score in score_list]
|
| 569 |
+
|
| 570 |
scores.append(np.mean(score_list))
|
| 571 |
if use_win_ratio:
|
| 572 |
result_list.append(np.mean(win_ratios))
|
|
|
|
| 687 |
|
| 688 |
return results_dfs
|
| 689 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
if __name__ == "__main__":
|
| 691 |
main()
|