edit
Browse files- leaderboard.py +16 -0
- main.py +54 -41
- rank_through_time.py +1 -0
leaderboard.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def compute_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
| 5 |
"""Compute average rank per model for each metric.
|
|
@@ -72,9 +74,23 @@ def compute_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 72 |
elif col.startswith("avg "):
|
| 73 |
leaderboard[col] = leaderboard[col].round(4)
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return leaderboard
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
| 78 |
if __name__ == "__main__":
|
| 79 |
df = pd.read_csv("mock_evaluation_results.csv")
|
| 80 |
lb = compute_leaderboard(df)
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
+
MEDALS = {0: "🥇", 1: "🥈", 2: "🥉"}
|
| 4 |
+
|
| 5 |
|
| 6 |
def compute_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
| 7 |
"""Compute average rank per model for each metric.
|
|
|
|
| 74 |
elif col.startswith("avg "):
|
| 75 |
leaderboard[col] = leaderboard[col].round(4)
|
| 76 |
|
| 77 |
+
# Add medals to model names
|
| 78 |
+
leaderboard = leaderboard.reset_index(drop=True)
|
| 79 |
+
leaderboard["model"] = [
|
| 80 |
+
f"{MEDALS.get(i, '')} {m}".strip()
|
| 81 |
+
for i, m in enumerate(leaderboard["model"])
|
| 82 |
+
]
|
| 83 |
+
|
| 84 |
+
# Reorder: model, avg columns, rank columns
|
| 85 |
+
avg_cols = sorted(c for c in leaderboard.columns if c.startswith("avg "))
|
| 86 |
+
rank_cols = sorted(c for c in leaderboard.columns if c.startswith("rank "))
|
| 87 |
+
leaderboard = leaderboard[["model"] + avg_cols + rank_cols]
|
| 88 |
+
|
| 89 |
return leaderboard
|
| 90 |
|
| 91 |
|
| 92 |
+
|
| 93 |
+
|
| 94 |
if __name__ == "__main__":
|
| 95 |
df = pd.read_csv("mock_evaluation_results.csv")
|
| 96 |
lb = compute_leaderboard(df)
|
main.py
CHANGED
|
@@ -34,52 +34,34 @@ def build_table(metric, subdataset, models):
|
|
| 34 |
def build_plots(metric, subdataset):
|
| 35 |
fig_rank = plot_rank_for_subdataset(df, metric, subdataset)
|
| 36 |
fig_value = plot_value_for_subdataset(df, metric, subdataset)
|
| 37 |
-
|
| 38 |
-
ret = fig_rank, fig_value
|
| 39 |
-
return ret
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
with gr.Blocks(title="Impermanent Leaderboard") as app:
|
| 43 |
gr.Markdown("# Impermanent Leaderboard")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
with gr.Tab("Leaderboard"):
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
with gr.Row():
|
| 53 |
-
metric_dd = gr.Dropdown(
|
| 54 |
-
choices=ALL_METRICS,
|
| 55 |
-
value=ALL_METRICS[0],
|
| 56 |
-
label="Metric",
|
| 57 |
-
)
|
| 58 |
-
subdataset_dd = gr.Dropdown(
|
| 59 |
-
choices=["All"] + ALL_SUBDATASETS,
|
| 60 |
-
value="All",
|
| 61 |
-
label="Subdataset",
|
| 62 |
-
)
|
| 63 |
-
models_dd = gr.Dropdown(
|
| 64 |
-
choices=ALL_MODELS,
|
| 65 |
-
value=ALL_MODELS,
|
| 66 |
-
multiselect=True,
|
| 67 |
-
label="Models",
|
| 68 |
-
)
|
| 69 |
-
|
| 70 |
-
results_table = gr.Dataframe(
|
| 71 |
-
value=build_table(ALL_METRICS[0], "All", ALL_MODELS),
|
| 72 |
-
label="Results",
|
| 73 |
)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
control.change(
|
| 77 |
-
fn=build_table,
|
| 78 |
-
inputs=[metric_dd, subdataset_dd, models_dd],
|
| 79 |
-
outputs=results_table,
|
| 80 |
-
)
|
| 81 |
-
|
| 82 |
-
with gr.Tab("Results over time"):
|
| 83 |
with gr.Row():
|
| 84 |
time_metric_dd = gr.Dropdown(
|
| 85 |
choices=ALL_METRICS,
|
|
@@ -99,7 +81,6 @@ with gr.Blocks(title="Impermanent Leaderboard") as app:
|
|
| 99 |
fig_rank, fig_value = build_plots(metric, subdataset)
|
| 100 |
return fig_rank, fig_value
|
| 101 |
|
| 102 |
-
# Initial render
|
| 103 |
app.load(
|
| 104 |
fn=update_plots,
|
| 105 |
inputs=[time_metric_dd, time_subdataset_dd],
|
|
@@ -113,5 +94,37 @@ with gr.Blocks(title="Impermanent Leaderboard") as app:
|
|
| 113 |
outputs=[rank_plot, value_plot],
|
| 114 |
)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
if __name__ == "__main__":
|
| 117 |
-
app.launch()
|
|
|
|
| 34 |
def build_plots(metric, subdataset):
|
| 35 |
fig_rank = plot_rank_for_subdataset(df, metric, subdataset)
|
| 36 |
fig_value = plot_value_for_subdataset(df, metric, subdataset)
|
| 37 |
+
return fig_rank, fig_value
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
+
HEADER_CSS = """\
|
| 41 |
+
.table-wrap thead th {
|
| 42 |
+
background-color: #e2e8f0 !important;
|
| 43 |
+
}
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
with gr.Blocks(title="Impermanent Leaderboard") as app:
|
| 47 |
gr.Markdown("# Impermanent Leaderboard")
|
| 48 |
+
gr.Markdown(
|
| 49 |
+
"A **live** time-series forecasting benchmark designed to avoid data contamination. "
|
| 50 |
+
"Automated pipelines continuously fetch fresh data from GitHub β including the number of "
|
| 51 |
+
"open issues, opened PRs, pushes, and stars β ensuring that models are always evaluated "
|
| 52 |
+
"on data they could not have seen during training."
|
| 53 |
+
)
|
| 54 |
|
| 55 |
+
with gr.Tab("Leaderboard 🏆"):
|
| 56 |
+
lb = compute_leaderboard(df)
|
| 57 |
+
gr.Dataframe(
|
| 58 |
+
value=lb,
|
| 59 |
+
#label="Leaderboard",
|
| 60 |
+
interactive=False,
|
| 61 |
+
headers=[f"**{c}**" for c in lb.columns],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
)
|
| 63 |
|
| 64 |
+
with gr.Tab("Results over time 📈"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
with gr.Row():
|
| 66 |
time_metric_dd = gr.Dropdown(
|
| 67 |
choices=ALL_METRICS,
|
|
|
|
| 81 |
fig_rank, fig_value = build_plots(metric, subdataset)
|
| 82 |
return fig_rank, fig_value
|
| 83 |
|
|
|
|
| 84 |
app.load(
|
| 85 |
fn=update_plots,
|
| 86 |
inputs=[time_metric_dd, time_subdataset_dd],
|
|
|
|
| 94 |
outputs=[rank_plot, value_plot],
|
| 95 |
)
|
| 96 |
|
| 97 |
+
with gr.Tab("All results 📊"):
|
| 98 |
+
with gr.Row():
|
| 99 |
+
metric_dd = gr.Dropdown(
|
| 100 |
+
choices=ALL_METRICS,
|
| 101 |
+
value=ALL_METRICS[0],
|
| 102 |
+
label="Metric",
|
| 103 |
+
)
|
| 104 |
+
subdataset_dd = gr.Dropdown(
|
| 105 |
+
choices=["All"] + ALL_SUBDATASETS,
|
| 106 |
+
value="All",
|
| 107 |
+
label="Subdataset",
|
| 108 |
+
)
|
| 109 |
+
models_dd = gr.Dropdown(
|
| 110 |
+
choices=ALL_MODELS,
|
| 111 |
+
value=ALL_MODELS,
|
| 112 |
+
multiselect=True,
|
| 113 |
+
label="Models",
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
results_table = gr.Dataframe(
|
| 117 |
+
value=build_table(ALL_METRICS[0], "All", ALL_MODELS),
|
| 118 |
+
label="Results",
|
| 119 |
+
interactive=False,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
for control in [metric_dd, subdataset_dd, models_dd]:
|
| 123 |
+
control.change(
|
| 124 |
+
fn=build_table,
|
| 125 |
+
inputs=[metric_dd, subdataset_dd, models_dd],
|
| 126 |
+
outputs=results_table,
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
if __name__ == "__main__":
|
| 130 |
+
app.launch(css=HEADER_CSS)
|
rank_through_time.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import matplotlib
|
| 2 |
matplotlib.use("Agg")
|
|
|
|
| 3 |
import pathlib
|
| 4 |
import pandas as pd
|
| 5 |
import matplotlib.pyplot as plt
|
|
|
|
| 1 |
import matplotlib
|
| 2 |
matplotlib.use("Agg")
|
| 3 |
+
matplotlib.rcParams["figure.dpi"] = 150
|
| 4 |
import pathlib
|
| 5 |
import pandas as pd
|
| 6 |
import matplotlib.pyplot as plt
|