openhands openhands committed on
Commit
b8aea20
·
1 Parent(s): 6a5c447

Remove Test Set/Validation Set tabs, keep single results view

Browse files

Simplified the UI by removing the dual tab structure (Test Set / Validation Set)
and keeping only a single results display using the test set data.

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (2) hide show
  1. category_page_builder.py +17 -67
  2. main_page.py +10 -40
category_page_builder.py CHANGED
@@ -13,15 +13,11 @@ CATEGORY_DIAGRAM_MAP = {
13
 
14
  def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
15
  with gr.Column(elem_id="page-content-wrapper"):
16
- validation_df, validation_tag_map = get_full_leaderboard_data("validation")
17
  test_df, test_tag_map = get_full_leaderboard_data("test")
18
  with gr.Row(elem_id="intro-row"):
19
 
20
  with gr.Column(scale=1):
21
  gr.HTML(f'<h2>OpenHands Index {CATEGORY_NAME} Leaderboard <span style="font-weight: normal; color: inherit;">(Aggregate)</span></h2>', elem_id="main-header")
22
- with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
23
- create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
24
-
25
  with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
26
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
27
 
@@ -41,67 +37,21 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
41
  interactive=False,
42
  elem_id="diagram-image"
43
  )
44
- # --- This page now has two main sections: Validation and Test ---
45
- with gr.Tabs():
46
- with gr.Tab("Results: Test Set") as test_tab:
47
- # Repeat the process for the "test" split
48
- if not test_df.empty:
49
- gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
50
- create_leaderboard_display(
51
- full_df=test_df,
52
- tag_map=test_tag_map,
53
- category_name=CATEGORY_NAME,
54
- split_name="test"
55
- )
56
- create_benchmark_details_display(
57
- full_df=test_df,
58
- tag_map=test_tag_map,
59
- category_name=CATEGORY_NAME,
60
- validation=False,
61
- )
62
- else:
63
- gr.Markdown("No data available for test split.")
64
- with gr.Tab("Results: Validation Set") as validation_tab:
65
- # 1. Load all necessary data for the "validation" split ONCE.
66
- if not validation_df.empty:
67
- gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
68
- # 2. Render the main category display using the loaded data.
69
- create_leaderboard_display(
70
- full_df=validation_df,
71
- tag_map=validation_tag_map,
72
- category_name=CATEGORY_NAME,
73
- split_name="validation"
74
- )
75
-
76
- # 3. Render the detailed breakdown for each benchmark in the category.
77
- create_benchmark_details_display(
78
- full_df=validation_df,
79
- tag_map=validation_tag_map,
80
- category_name=CATEGORY_NAME,
81
- validation=True,
82
- )
83
- else:
84
- gr.Markdown("No data available for validation split.")
85
-
86
-
87
- show_validation_js = """
88
- () => {
89
- document.getElementById('validation_nav_container').style.display = 'block';
90
- document.getElementById('test_nav_container').style.display = 'none';
91
- setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
92
- }
93
- """
94
-
95
- # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
96
- show_test_js = """
97
- () => {
98
- document.getElementById('validation_nav_container').style.display = 'none';
99
- document.getElementById('test_nav_container').style.display = 'block';
100
- }
101
- """
102
-
103
- # Assign the pure JS functions to the select events. No Python `fn` is needed.
104
- validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
105
- test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
106
 
107
- return validation_nav_container, test_nav_container
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
15
  with gr.Column(elem_id="page-content-wrapper"):
 
16
  test_df, test_tag_map = get_full_leaderboard_data("test")
17
  with gr.Row(elem_id="intro-row"):
18
 
19
  with gr.Column(scale=1):
20
  gr.HTML(f'<h2>OpenHands Index {CATEGORY_NAME} Leaderboard <span style="font-weight: normal; color: inherit;">(Aggregate)</span></h2>', elem_id="main-header")
 
 
 
21
  with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
22
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
23
 
 
37
  interactive=False,
38
  elem_id="diagram-image"
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ if not test_df.empty:
42
+ create_leaderboard_display(
43
+ full_df=test_df,
44
+ tag_map=test_tag_map,
45
+ category_name=CATEGORY_NAME,
46
+ split_name="test"
47
+ )
48
+ create_benchmark_details_display(
49
+ full_df=test_df,
50
+ tag_map=test_tag_map,
51
+ category_name=CATEGORY_NAME,
52
+ validation=False,
53
+ )
54
+ else:
55
+ gr.Markdown("No data available.")
56
+
57
+ return test_nav_container
main_page.py CHANGED
@@ -36,46 +36,16 @@ def build_page():
36
  CATEGORY_NAME = "Overall"
37
  gr.HTML(f'<h2>OpenHands Index {CATEGORY_NAME} Leaderboard <span style="font-weight: normal; color: inherit;">(Aggregate)</span></h2>', elem_id="main-header")
38
 
39
- with gr.Tabs() as tabs:
40
- with gr.Tab("Results: Test Set") as test_tab:
41
- test_df, test_tag_map = get_full_leaderboard_data("test")
42
- if not test_df.empty:
43
- gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
44
- create_leaderboard_display(
45
- full_df=test_df,
46
- tag_map=test_tag_map,
47
- category_name=CATEGORY_NAME, # Use our constant
48
- split_name="test"
49
- )
50
- else:
51
- gr.Markdown("No data available for test split.")
52
- with gr.Tab("Results: Validation Set") as validation_tab:
53
- # 1. Load all necessary data for the "validation" split ONCE.
54
- validation_df, validation_tag_map = get_full_leaderboard_data("validation")
55
- # Check if data was loaded successfully before trying to display it
56
- if not validation_df.empty:
57
- gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
58
- # 2. Render the display by calling the factory with the loaded data.
59
- create_leaderboard_display(
60
- full_df=validation_df,
61
- tag_map=validation_tag_map,
62
- category_name=CATEGORY_NAME, # Use our constant
63
- split_name="validation"
64
- )
65
- else:
66
- gr.Markdown("No data available for validation split.")
67
-
68
- # hiding this for now till we have the real paper data
69
- # with gr.Accordion("📙 Citation", open=False):
70
- # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
71
-
72
-
73
- # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
74
- show_validation_js = """
75
- () => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);}
76
- """
77
- # Assign the pure JS functions to the select events. No Python `fn` is needed.
78
- validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
79
 
80
  if __name__ == "__main__":
81
  demo.launch()
 
36
  CATEGORY_NAME = "Overall"
37
  gr.HTML(f'<h2>OpenHands Index {CATEGORY_NAME} Leaderboard <span style="font-weight: normal; color: inherit;">(Aggregate)</span></h2>', elem_id="main-header")
38
 
39
+ test_df, test_tag_map = get_full_leaderboard_data("test")
40
+ if not test_df.empty:
41
+ create_leaderboard_display(
42
+ full_df=test_df,
43
+ tag_map=test_tag_map,
44
+ category_name=CATEGORY_NAME,
45
+ split_name="test"
46
+ )
47
+ else:
48
+ gr.Markdown("No data available.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  if __name__ == "__main__":
51
  demo.launch()