Spaces:
Running
Running
Commit
·
d425853
1
Parent(s):
cb51391
add: more metric cols
Browse files- data/tabarena_leaderboard.csv.zip +2 -2
- main.py +68 -16
data/tabarena_leaderboard.csv.zip
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b23c724927320a54d5e4edcf1b2d938bc818c1dfd5461f1a8d204bb0b44d095
|
| 3 |
+
size 10582
|
main.py
CHANGED
|
@@ -26,6 +26,9 @@ tuned configurations. Each model is implemented in a tested real-world pipeline
|
|
| 26 |
optimized to get the most out of the model by the maintainers of TabArena, and where
|
| 27 |
possible together with the authors of the model.
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
**Reference Pipeline:** The leaderboard includes a reference pipeline, which is applied
|
| 30 |
independently of the tuning protocol and constraints we constructed for models within TabArena.
|
| 31 |
The reference pipeline aims to represent the performance quickly achievable by a
|
|
@@ -39,22 +42,68 @@ The current leaderboard is based on TabArena-v0.1.
|
|
| 39 |
|
| 40 |
|
| 41 |
ABOUT_TEXT = """
|
|
|
|
|
|
|
|
|
|
| 42 |
## Using TabArena for Benchmarking
|
| 43 |
To compare your own methods to the pre-computed results for all models on the leaderboard,
|
| 44 |
you can use the TabArena framework. For examples on how to use TabArena for benchmarking,
|
| 45 |
please see https://github.com/TabArena/tabarena_benchmarking_examples
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
## Contributing Data
|
| 48 |
For anything related to the datasets used in TabArena, please see https://github.com/TabArena/tabarena_dataset_curation
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
The current core maintainers of TabArena are:
|
| 59 |
[Nick Erickson](https://github.com/Innixma),
|
| 60 |
[Lennart Purucker](https://github.com/LennartPurucker/),
|
|
@@ -139,6 +188,9 @@ def load_data(filename: str):
|
|
| 139 |
+ df_leaderboard["elo-"].round(0).astype(int).astype(str)
|
| 140 |
)
|
| 141 |
# select only the columns we want to display
|
|
|
|
|
|
|
|
|
|
| 142 |
df_leaderboard = df_leaderboard.loc[
|
| 143 |
:,
|
| 144 |
[
|
|
@@ -147,8 +199,10 @@ def load_data(filename: str):
|
|
| 147 |
"method",
|
| 148 |
"elo",
|
| 149 |
"Elo 95% CI",
|
|
|
|
| 150 |
"rank",
|
| 151 |
-
"
|
|
|
|
| 152 |
"median_time_train_s_per_1K",
|
| 153 |
"median_time_infer_s_per_1K",
|
| 154 |
],
|
|
@@ -158,11 +212,11 @@ def load_data(filename: str):
|
|
| 158 |
df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
|
| 159 |
0
|
| 160 |
)
|
| 161 |
-
df_leaderboard[["median_time_train_s_per_1K", "rank"]] = df_leaderboard[
|
| 162 |
-
["median_time_train_s_per_1K", "rank"]
|
| 163 |
].round(2)
|
| 164 |
-
df_leaderboard[["normalized-
|
| 165 |
-
["normalized-
|
| 166 |
].round(3)
|
| 167 |
|
| 168 |
df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
|
|
@@ -177,14 +231,12 @@ def load_data(filename: str):
|
|
| 177 |
"method": "Model",
|
| 178 |
"elo": "Elo [⬆️]",
|
| 179 |
"rank": "Rank [⬇️]",
|
| 180 |
-
"normalized-
|
|
|
|
|
|
|
| 181 |
}
|
| 182 |
)
|
| 183 |
|
| 184 |
-
# TODO show ELO +/- sem
|
| 185 |
-
# TODO: rename and re-order columns
|
| 186 |
-
|
| 187 |
-
|
| 188 |
def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
|
| 189 |
df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply(
|
| 190 |
lambda m: f"{m} {model_type_emoji[m]}"
|
|
|
|
| 26 |
optimized to get the most out of the model by the maintainers of TabArena, and where
|
| 27 |
possible together with the authors of the model.
|
| 28 |
|
| 29 |
+
**Metrics:** The leaderboard is ranked based on Elo. We present several additional
|
| 30 |
+
metrics. See the `About` tab for more information on the metrics.
|
| 31 |
+
|
| 32 |
**Reference Pipeline:** The leaderboard includes a reference pipeline, which is applied
|
| 33 |
independently of the tuning protocol and constraints we constructed for models within TabArena.
|
| 34 |
The reference pipeline aims to represent the performance quickly achievable by a
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
ABOUT_TEXT = """
|
| 45 |
+
TabArena is a living benchmark system for predictive machine learning on tabular data.
|
| 46 |
+
We introduce TabArena and provide an overview of TabArena-v0.1 in our paper: TBA.
|
| 47 |
+
|
| 48 |
## Using TabArena for Benchmarking
|
| 49 |
To compare your own methods to the pre-computed results for all models on the leaderboard,
|
| 50 |
you can use the TabArena framework. For examples on how to use TabArena for benchmarking,
|
| 51 |
please see https://github.com/TabArena/tabarena_benchmarking_examples
|
| 52 |
|
| 53 |
+
## Contributing to the Leaderboard; Contributing Models
|
| 54 |
+
For guidelines on how to contribute your model to TabArena, or the result of your model
|
| 55 |
+
to the official leaderboard, please see the appendix of our paper: TBA.
|
| 56 |
+
|
| 57 |
## Contributing Data
|
| 58 |
For anything related to the datasets used in TabArena, please see https://github.com/TabArena/tabarena_dataset_curation
|
| 59 |
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## Leaderboard Documentation
|
| 63 |
+
|
| 64 |
+
The leaderboard is ranked by Elo and includes several other metrics. Here is a short
|
| 65 |
+
description for these metrics:
|
| 66 |
+
|
| 67 |
+
#### Elo
|
| 68 |
+
We evaluate models using the Elo rating system, following Chatbot Arena. Elo is a
|
| 69 |
+
pairwise comparison-based rating system where each model's rating predicts its expected
|
| 70 |
+
win probability against others, with a 400-point Elo gap corresponding to a 10 to 1
|
| 71 |
+
(91\%) expected win rate. We calibrate 1000 Elo to the performance of our default
|
| 72 |
+
random forest configuration across all figures, and perform 100 rounds of bootstrapping
|
| 73 |
+
to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
|
| 74 |
+
classification, log-loss for multiclass classification, and RMSE for regression.
|
| 75 |
+
|
| 76 |
+
#### Normalized Score
|
| 77 |
+
Following TabRepo, we linearly rescale the error such that the best method has a
|
| 78 |
+
normalized score of 1, and the median method has a normalized score of 0. Scores
|
| 79 |
+
below zero are clipped to zero. These scores are then averaged across datasets.
|
| 80 |
|
| 81 |
+
#### Average Rank
|
| 82 |
+
Ranks of methods are computed on each dataset (lower is better) and averaged.
|
| 83 |
|
| 84 |
+
#### Harmonic Mean Rank
|
| 85 |
+
Taking the harmonic mean of ranks, 1/((1/N) * sum(1/rank_i for i in range(N))),
|
| 86 |
+
more strongly favors methods having very low ranks on some datasets. It therefore favors
|
| 87 |
+
methods that are sometimes very good and sometimes very bad over methods that are
|
| 88 |
+
always mediocre, as the former are more likely to be useful in conjunction with
|
| 89 |
+
other methods.
|
| 90 |
+
|
| 91 |
+
#### Improvability
|
| 92 |
+
We introduce improvability as a metric that measures how many percent lower the error
|
| 93 |
+
of the best method is than that of the current method on a dataset. This is then averaged over
|
| 94 |
+
datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
|
| 95 |
+
Improvability is always between $0\%$ and $100\%$.
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## Contact
|
| 100 |
+
|
| 101 |
+
For most inquiries, please open issues in the relevant GitHub repository or here on
|
| 102 |
+
HuggingFace.
|
| 103 |
+
|
| 104 |
+
For any other inquiries related to TabArena, please reach out to: contact@tabarena.ai
|
| 105 |
+
|
| 106 |
+
### Core Maintainers
|
| 107 |
The current core maintainers of TabArena are:
|
| 108 |
[Nick Erickson](https://github.com/Innixma),
|
| 109 |
[Lennart Purucker](https://github.com/LennartPurucker/),
|
|
|
|
| 188 |
+ df_leaderboard["elo-"].round(0).astype(int).astype(str)
|
| 189 |
)
|
| 190 |
# select only the columns we want to display
|
| 191 |
+
df_leaderboard["normalized-score"] = 1 - df_leaderboard["normalized-error"]
|
| 192 |
+
df_leaderboard["hmr"] = 1/df_leaderboard["mrr"]
|
| 193 |
+
df_leaderboard["improvability"] = 100 * df_leaderboard["champ_delta"]
|
| 194 |
df_leaderboard = df_leaderboard.loc[
|
| 195 |
:,
|
| 196 |
[
|
|
|
|
| 199 |
"method",
|
| 200 |
"elo",
|
| 201 |
"Elo 95% CI",
|
| 202 |
+
"normalized-score",
|
| 203 |
"rank",
|
| 204 |
+
"hmr",
|
| 205 |
+
"improvability",
|
| 206 |
"median_time_train_s_per_1K",
|
| 207 |
"median_time_infer_s_per_1K",
|
| 208 |
],
|
|
|
|
| 212 |
df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
|
| 213 |
0
|
| 214 |
)
|
| 215 |
+
df_leaderboard[["median_time_train_s_per_1K", "rank", "hmr"]] = df_leaderboard[
|
| 216 |
+
["median_time_train_s_per_1K", "rank", "hmr"]
|
| 217 |
].round(2)
|
| 218 |
+
df_leaderboard[["normalized-score", "median_time_infer_s_per_1K", "improvability"]] = df_leaderboard[
|
| 219 |
+
["normalized-score", "median_time_infer_s_per_1K", "improvability"]
|
| 220 |
].round(3)
|
| 221 |
|
| 222 |
df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
|
|
|
|
| 231 |
"method": "Model",
|
| 232 |
"elo": "Elo [⬆️]",
|
| 233 |
"rank": "Rank [⬇️]",
|
| 234 |
+
"normalized-score": "Normalized Score [⬆️]",
|
| 235 |
+
"hmr": "Harmonic Mean Rank [⬇️]",
|
| 236 |
+
"improvability": "Improvability (%) [⬇️]",
|
| 237 |
}
|
| 238 |
)
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
|
| 241 |
df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply(
|
| 242 |
lambda m: f"{m} {model_type_emoji[m]}"
|