Spaces:
Running
Running
Amber Tanaka
committed on
Switch default tab from validation to test (#12)
Browse files
- c_and_e.py +22 -23
- data_analysis.py +21 -21
- e2e.py +21 -20
- leaderboard_transformer.py +2 -2
- literature_understanding.py +22 -22
- main_page.py +14 -16
- ui_components.py +10 -2
c_and_e.py
CHANGED
|
@@ -12,15 +12,33 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(CODE_EXECUTION_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
with gr.Column(elem_id="validation_nav_container", visible=
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
-
with gr.Column(elem_id="test_nav_container", visible=
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
|
| 20 |
|
| 21 |
# --- This page now has two main sections: Validation and Test ---
|
| 22 |
with gr.Tabs():
|
| 23 |
-
with gr.Tab("Results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 25 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 26 |
|
|
@@ -42,29 +60,11 @@ with gr.Blocks() as demo:
|
|
| 42 |
else:
|
| 43 |
gr.Markdown("No data available for validation split.")
|
| 44 |
|
| 45 |
-
with gr.Tab("Results: Test") as test_tab:
|
| 46 |
-
# Repeat the process for the "test" split
|
| 47 |
-
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 48 |
-
|
| 49 |
-
if not test_df.empty:
|
| 50 |
-
create_leaderboard_display(
|
| 51 |
-
full_df=test_df,
|
| 52 |
-
tag_map=test_tag_map,
|
| 53 |
-
category_name=CATEGORY_NAME,
|
| 54 |
-
split_name="test"
|
| 55 |
-
)
|
| 56 |
-
create_benchmark_details_display(
|
| 57 |
-
full_df=test_df,
|
| 58 |
-
tag_map=test_tag_map,
|
| 59 |
-
category_name=CATEGORY_NAME
|
| 60 |
-
)
|
| 61 |
-
else:
|
| 62 |
-
gr.Markdown("No data available for test split.")
|
| 63 |
-
|
| 64 |
show_validation_js = """
|
| 65 |
() => {
|
| 66 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 67 |
document.getElementById('test_nav_container').style.display = 'none';
|
|
|
|
| 68 |
}
|
| 69 |
"""
|
| 70 |
|
|
@@ -73,7 +73,6 @@ with gr.Blocks() as demo:
|
|
| 73 |
() => {
|
| 74 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 75 |
document.getElementById('test_nav_container').style.display = 'block';
|
| 76 |
-
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 77 |
}
|
| 78 |
"""
|
| 79 |
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(CODE_EXECUTION_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
+
with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
|
| 20 |
|
| 21 |
# --- This page now has two main sections: Validation and Test ---
|
| 22 |
with gr.Tabs():
|
| 23 |
+
with gr.Tab("Results: Test Set") as test_tab:
|
| 24 |
+
# Repeat the process for the "test" split
|
| 25 |
+
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 26 |
+
|
| 27 |
+
if not test_df.empty:
|
| 28 |
+
create_leaderboard_display(
|
| 29 |
+
full_df=test_df,
|
| 30 |
+
tag_map=test_tag_map,
|
| 31 |
+
category_name=CATEGORY_NAME,
|
| 32 |
+
split_name="test"
|
| 33 |
+
)
|
| 34 |
+
create_benchmark_details_display(
|
| 35 |
+
full_df=test_df,
|
| 36 |
+
tag_map=test_tag_map,
|
| 37 |
+
category_name=CATEGORY_NAME
|
| 38 |
+
)
|
| 39 |
+
else:
|
| 40 |
+
gr.Markdown("No data available for test split.")
|
| 41 |
+
with gr.Tab("Results: Validation Set") as validation_tab:
|
| 42 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 43 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 44 |
|
|
|
|
| 60 |
else:
|
| 61 |
gr.Markdown("No data available for validation split.")
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
show_validation_js = """
|
| 64 |
() => {
|
| 65 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 66 |
document.getElementById('test_nav_container').style.display = 'none';
|
| 67 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 68 |
}
|
| 69 |
"""
|
| 70 |
|
|
|
|
| 73 |
() => {
|
| 74 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 75 |
document.getElementById('test_nav_container').style.display = 'block';
|
|
|
|
| 76 |
}
|
| 77 |
"""
|
| 78 |
|
data_analysis.py
CHANGED
|
@@ -12,12 +12,30 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(DATA_ANALYSIS_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
with gr.Column(elem_id="validation_nav_container", visible=
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
-
with gr.Column(elem_id="test_nav_container", visible=
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
with gr.Tab("Results: Validation") as validation_tab:
|
| 22 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 23 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
|
@@ -40,29 +58,12 @@ with gr.Blocks() as demo:
|
|
| 40 |
else:
|
| 41 |
gr.Markdown("No data available for validation split.")
|
| 42 |
|
| 43 |
-
with gr.Tab("Results: Test") as test_tab:
|
| 44 |
-
# Repeat the process for the "test" split
|
| 45 |
-
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 46 |
-
|
| 47 |
-
if not test_df.empty:
|
| 48 |
-
create_leaderboard_display(
|
| 49 |
-
full_df=test_df,
|
| 50 |
-
tag_map=test_tag_map,
|
| 51 |
-
category_name=CATEGORY_NAME,
|
| 52 |
-
split_name="test"
|
| 53 |
-
)
|
| 54 |
-
create_benchmark_details_display(
|
| 55 |
-
full_df=test_df,
|
| 56 |
-
tag_map=test_tag_map,
|
| 57 |
-
category_name=CATEGORY_NAME
|
| 58 |
-
)
|
| 59 |
-
else:
|
| 60 |
-
gr.Markdown("No data available for test split.")
|
| 61 |
|
| 62 |
show_validation_js = """
|
| 63 |
() => {
|
| 64 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 65 |
document.getElementById('test_nav_container').style.display = 'none';
|
|
|
|
| 66 |
}
|
| 67 |
"""
|
| 68 |
|
|
@@ -71,7 +72,6 @@ with gr.Blocks() as demo:
|
|
| 71 |
() => {
|
| 72 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 73 |
document.getElementById('test_nav_container').style.display = 'block';
|
| 74 |
-
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 75 |
}
|
| 76 |
"""
|
| 77 |
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(DATA_ANALYSIS_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
+
with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
| 21 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 22 |
+
# Repeat the process for the "test" split
|
| 23 |
+
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 24 |
+
|
| 25 |
+
if not test_df.empty:
|
| 26 |
+
create_leaderboard_display(
|
| 27 |
+
full_df=test_df,
|
| 28 |
+
tag_map=test_tag_map,
|
| 29 |
+
category_name=CATEGORY_NAME,
|
| 30 |
+
split_name="test"
|
| 31 |
+
)
|
| 32 |
+
create_benchmark_details_display(
|
| 33 |
+
full_df=test_df,
|
| 34 |
+
tag_map=test_tag_map,
|
| 35 |
+
category_name=CATEGORY_NAME
|
| 36 |
+
)
|
| 37 |
+
else:
|
| 38 |
+
gr.Markdown("No data available for test split.")
|
| 39 |
with gr.Tab("Results: Validation") as validation_tab:
|
| 40 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 41 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
|
|
|
| 58 |
else:
|
| 59 |
gr.Markdown("No data available for validation split.")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
show_validation_js = """
|
| 63 |
() => {
|
| 64 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 65 |
document.getElementById('test_nav_container').style.display = 'none';
|
| 66 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 67 |
}
|
| 68 |
"""
|
| 69 |
|
|
|
|
| 72 |
() => {
|
| 73 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 74 |
document.getElementById('test_nav_container').style.display = 'block';
|
|
|
|
| 75 |
}
|
| 76 |
"""
|
| 77 |
|
e2e.py
CHANGED
|
@@ -12,12 +12,30 @@ with gr.Blocks() as demo:
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(DISCOVERY_DESCRIPTION, elem_id="category-intro")
|
| 15 |
-
with gr.Column(elem_id="validation_nav_container", visible=
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
-
with gr.Column(elem_id="test_nav_container", visible=
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
with gr.Tab("Results: Validation") as validation_tab:
|
| 22 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 23 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
|
@@ -40,28 +58,12 @@ with gr.Blocks() as demo:
|
|
| 40 |
else:
|
| 41 |
gr.Markdown("No data available for validation split.")
|
| 42 |
|
| 43 |
-
with gr.Tab("Results: Test") as test_tab:
|
| 44 |
-
# Repeat the process for the "test" split
|
| 45 |
-
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 46 |
|
| 47 |
-
if not test_df.empty:
|
| 48 |
-
create_leaderboard_display(
|
| 49 |
-
full_df=test_df,
|
| 50 |
-
tag_map=test_tag_map,
|
| 51 |
-
category_name=CATEGORY_NAME,
|
| 52 |
-
split_name="test"
|
| 53 |
-
)
|
| 54 |
-
create_benchmark_details_display(
|
| 55 |
-
full_df=test_df,
|
| 56 |
-
tag_map=test_tag_map,
|
| 57 |
-
category_name=CATEGORY_NAME
|
| 58 |
-
)
|
| 59 |
-
else:
|
| 60 |
-
gr.Markdown("No data available for test split.")
|
| 61 |
show_validation_js = """
|
| 62 |
() => {
|
| 63 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 64 |
document.getElementById('test_nav_container').style.display = 'none';
|
|
|
|
| 65 |
}
|
| 66 |
"""
|
| 67 |
|
|
@@ -70,7 +72,6 @@ with gr.Blocks() as demo:
|
|
| 70 |
() => {
|
| 71 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 72 |
document.getElementById('test_nav_container').style.display = 'block';
|
| 73 |
-
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 74 |
}
|
| 75 |
"""
|
| 76 |
|
|
|
|
| 12 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 13 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 14 |
gr.Markdown(DISCOVERY_DESCRIPTION, elem_id="category-intro")
|
| 15 |
+
with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
|
| 16 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 17 |
+
with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
|
| 18 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 19 |
# --- This page now has two main sections: Validation and Test ---
|
| 20 |
with gr.Tabs():
|
| 21 |
+
with gr.Tab("Results: Test") as test_tab:
|
| 22 |
+
# Repeat the process for the "test" split
|
| 23 |
+
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 24 |
+
|
| 25 |
+
if not test_df.empty:
|
| 26 |
+
create_leaderboard_display(
|
| 27 |
+
full_df=test_df,
|
| 28 |
+
tag_map=test_tag_map,
|
| 29 |
+
category_name=CATEGORY_NAME,
|
| 30 |
+
split_name="test"
|
| 31 |
+
)
|
| 32 |
+
create_benchmark_details_display(
|
| 33 |
+
full_df=test_df,
|
| 34 |
+
tag_map=test_tag_map,
|
| 35 |
+
category_name=CATEGORY_NAME
|
| 36 |
+
)
|
| 37 |
+
else:
|
| 38 |
+
gr.Markdown("No data available for test split.")
|
| 39 |
with gr.Tab("Results: Validation") as validation_tab:
|
| 40 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 41 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
|
|
|
| 58 |
else:
|
| 59 |
gr.Markdown("No data available for validation split.")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
show_validation_js = """
|
| 63 |
() => {
|
| 64 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 65 |
document.getElementById('test_nav_container').style.display = 'none';
|
| 66 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 67 |
}
|
| 68 |
"""
|
| 69 |
|
|
|
|
| 72 |
() => {
|
| 73 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 74 |
document.getElementById('test_nav_container').style.display = 'block';
|
|
|
|
| 75 |
}
|
| 76 |
"""
|
| 77 |
|
leaderboard_transformer.py
CHANGED
|
@@ -414,7 +414,7 @@ def _plot_scatter_plotly(
|
|
| 414 |
text=group['hover_text'],
|
| 415 |
hoverinfo='text',
|
| 416 |
marker=dict(
|
| 417 |
-
color=color_map.get(category, '
|
| 418 |
symbol=group['shape_symbol'],
|
| 419 |
size=10,
|
| 420 |
opacity=0.8,
|
|
@@ -445,7 +445,7 @@ def _plot_scatter_plotly(
|
|
| 445 |
name=shape_name,
|
| 446 |
legendgroup="tooling_group",
|
| 447 |
legendgrouptitle_text="Agent Tooling" if i == 0 else None,
|
| 448 |
-
marker=dict(color='
|
| 449 |
))
|
| 450 |
|
| 451 |
# --- Section 8: Configure Layout (Restored from your original code) ---
|
|
|
|
| 414 |
text=group['hover_text'],
|
| 415 |
hoverinfo='text',
|
| 416 |
marker=dict(
|
| 417 |
+
color=color_map.get(category, 'black'),
|
| 418 |
symbol=group['shape_symbol'],
|
| 419 |
size=10,
|
| 420 |
opacity=0.8,
|
|
|
|
| 445 |
name=shape_name,
|
| 446 |
legendgroup="tooling_group",
|
| 447 |
legendgrouptitle_text="Agent Tooling" if i == 0 else None,
|
| 448 |
+
marker=dict(color='black', symbol=shape_symbol, size=12)
|
| 449 |
))
|
| 450 |
|
| 451 |
# --- Section 8: Configure Layout (Restored from your original code) ---
|
literature_understanding.py
CHANGED
|
@@ -13,15 +13,33 @@ with gr.Blocks() as demo:
|
|
| 13 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 14 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 15 |
gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
|
| 16 |
-
with gr.Column(elem_id="validation_nav_container", visible=
|
| 17 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 18 |
|
| 19 |
-
with gr.Column(elem_id="test_nav_container", visible=
|
| 20 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 21 |
|
| 22 |
# --- This page now has two main sections: Validation and Test ---
|
| 23 |
with gr.Tabs():
|
| 24 |
-
with gr.Tab("Results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 26 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 27 |
|
|
@@ -43,29 +61,12 @@ with gr.Blocks() as demo:
|
|
| 43 |
else:
|
| 44 |
gr.Markdown("No data available for validation split.")
|
| 45 |
|
| 46 |
-
with gr.Tab("Results: Test") as test_tab:
|
| 47 |
-
# Repeat the process for the "test" split
|
| 48 |
-
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 49 |
-
|
| 50 |
-
if not test_df.empty:
|
| 51 |
-
create_leaderboard_display(
|
| 52 |
-
full_df=test_df,
|
| 53 |
-
tag_map=test_tag_map,
|
| 54 |
-
category_name=CATEGORY_NAME,
|
| 55 |
-
split_name="test"
|
| 56 |
-
)
|
| 57 |
-
create_benchmark_details_display(
|
| 58 |
-
full_df=test_df,
|
| 59 |
-
tag_map=test_tag_map,
|
| 60 |
-
category_name=CATEGORY_NAME
|
| 61 |
-
)
|
| 62 |
-
else:
|
| 63 |
-
gr.Markdown("No data available for test split.")
|
| 64 |
|
| 65 |
show_validation_js = """
|
| 66 |
() => {
|
| 67 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 68 |
document.getElementById('test_nav_container').style.display = 'none';
|
|
|
|
| 69 |
}
|
| 70 |
"""
|
| 71 |
|
|
@@ -74,7 +75,6 @@ with gr.Blocks() as demo:
|
|
| 74 |
() => {
|
| 75 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 76 |
document.getElementById('test_nav_container').style.display = 'block';
|
| 77 |
-
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 78 |
}
|
| 79 |
"""
|
| 80 |
|
|
|
|
| 13 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 14 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 15 |
gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
|
| 16 |
+
with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
|
| 17 |
create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
|
| 18 |
|
| 19 |
+
with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
|
| 20 |
create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
|
| 21 |
|
| 22 |
# --- This page now has two main sections: Validation and Test ---
|
| 23 |
with gr.Tabs():
|
| 24 |
+
with gr.Tab("Results: Test Set") as test_tab:
|
| 25 |
+
# Repeat the process for the "test" split
|
| 26 |
+
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 27 |
+
|
| 28 |
+
if not test_df.empty:
|
| 29 |
+
create_leaderboard_display(
|
| 30 |
+
full_df=test_df,
|
| 31 |
+
tag_map=test_tag_map,
|
| 32 |
+
category_name=CATEGORY_NAME,
|
| 33 |
+
split_name="test"
|
| 34 |
+
)
|
| 35 |
+
create_benchmark_details_display(
|
| 36 |
+
full_df=test_df,
|
| 37 |
+
tag_map=test_tag_map,
|
| 38 |
+
category_name=CATEGORY_NAME
|
| 39 |
+
)
|
| 40 |
+
else:
|
| 41 |
+
gr.Markdown("No data available for test split.")
|
| 42 |
+
with gr.Tab("Results: Validation Set") as validation_tab:
|
| 43 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 44 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 45 |
|
|
|
|
| 61 |
else:
|
| 62 |
gr.Markdown("No data available for validation split.")
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
show_validation_js = """
|
| 66 |
() => {
|
| 67 |
document.getElementById('validation_nav_container').style.display = 'block';
|
| 68 |
document.getElementById('test_nav_container').style.display = 'none';
|
| 69 |
+
setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
|
| 70 |
}
|
| 71 |
"""
|
| 72 |
|
|
|
|
| 75 |
() => {
|
| 76 |
document.getElementById('validation_nav_container').style.display = 'none';
|
| 77 |
document.getElementById('test_nav_container').style.display = 'block';
|
|
|
|
| 78 |
}
|
| 79 |
"""
|
| 80 |
|
main_page.py
CHANGED
|
@@ -23,10 +23,20 @@ with gr.Blocks(fill_width=True) as demo:
|
|
| 23 |
gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
|
| 24 |
|
| 25 |
with gr.Tabs() as tabs:
|
| 26 |
-
with gr.Tab("Results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 28 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 29 |
-
|
| 30 |
# Check if data was loaded successfully before trying to display it
|
| 31 |
if not validation_df.empty:
|
| 32 |
# 2. Render the display by calling the factory with the loaded data.
|
|
@@ -39,28 +49,16 @@ with gr.Blocks(fill_width=True) as demo:
|
|
| 39 |
else:
|
| 40 |
gr.Markdown("No data available for validation split.")
|
| 41 |
|
| 42 |
-
with gr.Tab("Results: Test") as test_tab:
|
| 43 |
-
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 44 |
-
if not test_df.empty:
|
| 45 |
-
create_leaderboard_display(
|
| 46 |
-
full_df=test_df,
|
| 47 |
-
tag_map=test_tag_map,
|
| 48 |
-
category_name=CATEGORY_NAME, # Use our constant
|
| 49 |
-
split_name="test"
|
| 50 |
-
)
|
| 51 |
-
else:
|
| 52 |
-
gr.Markdown("No data available for test split.")
|
| 53 |
-
|
| 54 |
with gr.Accordion("📙 Citation", open=False):
|
| 55 |
gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
|
| 56 |
|
| 57 |
|
| 58 |
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 59 |
-
|
| 60 |
() => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);}
|
| 61 |
"""
|
| 62 |
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 63 |
-
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
demo.launch()
|
|
|
|
| 23 |
gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
|
| 24 |
|
| 25 |
with gr.Tabs() as tabs:
|
| 26 |
+
with gr.Tab("Results: Test Set") as test_tab:
|
| 27 |
+
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 28 |
+
if not test_df.empty:
|
| 29 |
+
create_leaderboard_display(
|
| 30 |
+
full_df=test_df,
|
| 31 |
+
tag_map=test_tag_map,
|
| 32 |
+
category_name=CATEGORY_NAME, # Use our constant
|
| 33 |
+
split_name="test"
|
| 34 |
+
)
|
| 35 |
+
else:
|
| 36 |
+
gr.Markdown("No data available for test split.")
|
| 37 |
+
with gr.Tab("Results: Validation Set") as validation_tab:
|
| 38 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 39 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
|
|
|
| 40 |
# Check if data was loaded successfully before trying to display it
|
| 41 |
if not validation_df.empty:
|
| 42 |
# 2. Render the display by calling the factory with the loaded data.
|
|
|
|
| 49 |
else:
|
| 50 |
gr.Markdown("No data available for validation split.")
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
with gr.Accordion("📙 Citation", open=False):
|
| 53 |
gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
|
| 54 |
|
| 55 |
|
| 56 |
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
| 57 |
+
show_validation_js = """
|
| 58 |
() => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);}
|
| 59 |
"""
|
| 60 |
# Assign the pure JS functions to the select events. No Python `fn` is needed.
|
| 61 |
+
validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
|
| 62 |
|
| 63 |
if __name__ == "__main__":
|
| 64 |
demo.launch()
|
ui_components.py
CHANGED
|
@@ -336,7 +336,7 @@ def create_leaderboard_display(
|
|
| 336 |
datatype=df_datatypes,
|
| 337 |
interactive=False,
|
| 338 |
wrap=True,
|
| 339 |
-
column_widths=[30, 30, 30,
|
| 340 |
elem_classes=["wrap-header-df"]
|
| 341 |
)
|
| 342 |
|
|
@@ -527,7 +527,14 @@ def create_benchmark_details_display(
|
|
| 527 |
df_datatypes.append("html")
|
| 528 |
else:
|
| 529 |
df_datatypes.append("str")
|
| 530 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
# Create the scatter plot using the full data for context, but plotting benchmark metrics
|
| 532 |
# This shows all agents on the same axis for better comparison.
|
| 533 |
benchmark_plot = _plot_scatter_plotly(
|
|
@@ -547,6 +554,7 @@ def create_benchmark_details_display(
|
|
| 547 |
datatype=df_datatypes,
|
| 548 |
interactive=False,
|
| 549 |
wrap=True,
|
|
|
|
| 550 |
elem_classes=["wrap-header-df"]
|
| 551 |
)
|
| 552 |
|
|
|
|
| 336 |
datatype=df_datatypes,
|
| 337 |
interactive=False,
|
| 338 |
wrap=True,
|
| 339 |
+
column_widths=[30, 30, 30, 200],
|
| 340 |
elem_classes=["wrap-header-df"]
|
| 341 |
)
|
| 342 |
|
|
|
|
| 527 |
df_datatypes.append("html")
|
| 528 |
else:
|
| 529 |
df_datatypes.append("str")
|
| 530 |
+
# Remove Pareto, Openness, and Agent Tooling from the headers
|
| 531 |
+
header_rename_map = {
|
| 532 |
+
"Pareto": "",
|
| 533 |
+
"Openness": "",
|
| 534 |
+
"Agent Tooling": ""
|
| 535 |
+
}
|
| 536 |
+
# 2. Create the final list of headers for display.
|
| 537 |
+
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
| 538 |
# Create the scatter plot using the full data for context, but plotting benchmark metrics
|
| 539 |
# This shows all agents on the same axis for better comparison.
|
| 540 |
benchmark_plot = _plot_scatter_plotly(
|
|
|
|
| 554 |
datatype=df_datatypes,
|
| 555 |
interactive=False,
|
| 556 |
wrap=True,
|
| 557 |
+
column_widths=[40, 40, 40, 350],
|
| 558 |
elem_classes=["wrap-header-df"]
|
| 559 |
)
|
| 560 |
|