Spaces:

alexandrainst
/

radial-plot-generator

Running

App Files Community

saattrupdan committed on Jan 29, 2024

Commit

576340d

1 Parent(s): ada1f6c

chore: Revert last change

Browse files

Files changed (1) hide show

app.py +6 -50

app.py CHANGED Viewed

@@ -555,19 +555,18 @@ def produce_radial_plot(
             for language in languages:
                 if model_id not in results_dfs_filtered[language].index:
                     continue
                 score_list = results_dfs_filtered[language].loc[model_id][task]
-                if all(score < 1 for score in score_list):
-                    score_list = [100 * score for score in score_list]
                 win_ratio = 100 * np.mean([
-                    scores_statistically_better(
-                        score_values_1=score_list, score_values_2=other_scores
-                    )
                     for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
                 ])
                 win_ratios.append(win_ratio)
                 scores.append(np.mean(score_list))
             if use_win_ratio:
                 result_list.append(np.mean(win_ratios))
@@ -688,48 +687,5 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
     return results_dfs
-def scores_statistically_better(
-    score_values_1: list[float], score_values_2: list[float]
-) -> bool:
-    """Determine whether the first score group is statistically better than the second.
-    Args:
-        score_values_1:
-            The scores for the first group.
-        score_values_2:
-            The scores for the second group.
-    Returns:
-        Whether the first score group is statistically better than the second.
-    """
-    assert len(score_values_1) == len(score_values_2), (
-        "The two score groups must have the same length."
-    )
-    # Separate the scores into groups of 10, consisting of the scores for each
-    # dataset
-    group_scores_1 = [
-        score_values_1[idx:idx+10] for idx in range(0, len(score_values_1), 10)
-    ]
-    group_scores_2 = [
-        score_values_2[idx:idx+10] for idx in range(0, len(score_values_2), 10)
-    ]
-    # Compute t-statistics for each group separately, and compute the mean
-    # t-statistic
-    t_statistics = [
-        stats.ttest_ind(a=group_1, b=group_2, alternative="greater").statistic
-        for group_1, group_2 in zip(group_scores_1, group_scores_2)
-    ]
-    mean_t_statistic = np.mean(t_statistics)
-    # Compute the p-value for the mean t-statistic, where the null hypothesis is
-    # that the first group does not have a larger mean score than the second group
-    degrees_of_freedom = len(score_values_1) - 1
-    p_value = 1 - stats.t.cdf(abs(mean_t_statistic), degrees_of_freedom)
-    return p_value < 0.05
 if __name__ == "__main__":
     main()

             for language in languages:
                 if model_id not in results_dfs_filtered[language].index:
                     continue
                 score_list = results_dfs_filtered[language].loc[model_id][task]
                 win_ratio = 100 * np.mean([
+                    stats.ttest_rel(
+                        a=score_list, b=other_scores, alternative="greater"
+                    ).pvalue < 0.05
                     for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
                 ])
                 win_ratios.append(win_ratio)
+                if all(score < 1 for score in score_list):
+                    score_list = [100 * score for score in score_list]
                 scores.append(np.mean(score_list))
             if use_win_ratio:
                 result_list.append(np.mean(win_ratios))
     return results_dfs
 if __name__ == "__main__":
     main()