sunmarinup committed on
Commit
9e60ee8
·
1 Parent(s): d81c02b

Colorize output

Browse files
app.py CHANGED
@@ -1,21 +1,18 @@
1
- from enum import Enum
2
- import io
3
  from datetime import datetime, timezone
4
 
5
  import gradio as gr
6
  import pandas as pd
7
 
8
- from src.about import TITLE, INTRODUCTION_TEXT
9
  from src.display.css_html_js import custom_css
 
10
  from src.leaderboard.input import load_csv_from_github
11
  from src.leaderboard.output import format_output_df
12
- from src.leaderboard.columns import DisplayColumns
13
-
14
 
15
  LEADERBOARD_GITHUB_URL = "https://github.com/upgini/mle-bench/blob/main/rankings/low/tabular/overall_ranks.csv"
16
 
17
 
18
- def download_leaderboard() -> pd.DataFrame:
19
  """Download the remote leaderboard CSV from GitHub (handles Git LFS).
20
 
21
  Returns a processed DataFrame ready for display.
@@ -35,7 +32,7 @@ def download_leaderboard() -> pd.DataFrame:
35
 
36
  def refresh_leaderboard():
37
  """Fetch the leaderboard and build the status message for the UI."""
38
- df = download_leaderboard()
39
  status = (
40
  f"Showing data from [GitHub]({LEADERBOARD_GITHUB_URL}). "
41
  f"Last refreshed: {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}."
@@ -43,20 +40,45 @@ def refresh_leaderboard():
43
  return df, status
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def create_app():
47
  """Create and configure the Gradio app without launching it."""
48
  with gr.Blocks(title="Upgini MLE-Bench Leaderboard", css=custom_css) as demo:
49
  gr.HTML(TITLE)
50
  gr.Markdown(INTRODUCTION_TEXT)
51
 
 
52
  leaderboard_table = gr.DataFrame(
53
  value=pd.DataFrame(columns=DisplayColumns.values()),
54
  wrap=True,
55
  interactive=False,
56
  type="pandas",
 
57
  label="Leaderboard",
58
  elem_id="leaderboard-table",
59
  )
 
60
  status_text = gr.Markdown()
61
  refresh_button = gr.Button("Refresh leaderboard", variant="primary")
62
 
 
 
 
1
  from datetime import datetime, timezone
2
 
3
  import gradio as gr
4
  import pandas as pd
5
 
6
+ from src.about import INTRODUCTION_TEXT, TITLE
7
  from src.display.css_html_js import custom_css
8
+ from src.leaderboard.columns import DisplayColumns, RequiredInputColumns
9
  from src.leaderboard.input import load_csv_from_github
10
  from src.leaderboard.output import format_output_df
 
 
11
 
12
  LEADERBOARD_GITHUB_URL = "https://github.com/upgini/mle-bench/blob/main/rankings/low/tabular/overall_ranks.csv"
13
 
14
 
15
+ def load_leaderboard() -> pd.DataFrame:
16
  """Download the remote leaderboard CSV from GitHub (handles Git LFS).
17
 
18
  Returns a processed DataFrame ready for display.
 
32
 
33
  def refresh_leaderboard():
34
  """Fetch the leaderboard and build the status message for the UI."""
35
+ df = apply_styling(load_leaderboard())
36
  status = (
37
  f"Showing data from [GitHub]({LEADERBOARD_GITHUB_URL}). "
38
  f"Last refreshed: {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}."
 
40
  return df, status
41
 
42
 
43
def apply_styling(df: pd.DataFrame):
    """Return a pandas Styler that colour-grades the two score columns.

    Only the columns listed by ``DisplayColumns.values()`` are kept in the
    styled table. Each gradient is driven by a numeric column of ``df``
    supplied through ``gmap`` rather than by the displayed cell values, so
    ``df`` must still contain ``RequiredInputColumns.MEAN_NORMALIZED_SCORE``
    and ``RequiredInputColumns.MEAN_MEDAL_PCT`` in addition to the display
    columns.

    Args:
        df: Leaderboard frame holding both the display columns and the
            numeric helper columns named above.

    Returns:
        The ``Styler`` of the display-only frame with a green gradient on the
        normalized-score column and an orange gradient on the medal column.
    """
    visible = df[DisplayColumns.values()]

    # Normalized score: shaded green according to the numeric mean score.
    score_gradient = {
        "subset": [DisplayColumns.NORMALIZED_SCORE],
        "cmap": "Greens",
        "low": 0.0,
        "high": 0.5,
        "gmap": df[RequiredInputColumns.MEAN_NORMALIZED_SCORE],
    }
    # Medal score: shaded orange according to the numeric mean medal share.
    medal_gradient = {
        "subset": [DisplayColumns.ANY_MEDAL_SCORE],
        "cmap": "Oranges",
        "low": 0.0,
        "high": 1.2,
        "gmap": df[RequiredInputColumns.MEAN_MEDAL_PCT],
    }

    styler = visible.style
    styler = styler.background_gradient(**score_gradient)
    styler = styler.background_gradient(**medal_gradient)
    return styler
63
+
64
+
65
  def create_app():
66
  """Create and configure the Gradio app without launching it."""
67
  with gr.Blocks(title="Upgini MLE-Bench Leaderboard", css=custom_css) as demo:
68
  gr.HTML(TITLE)
69
  gr.Markdown(INTRODUCTION_TEXT)
70
 
71
+ # style = apply_styling(load_leaderboard())
72
  leaderboard_table = gr.DataFrame(
73
  value=pd.DataFrame(columns=DisplayColumns.values()),
74
  wrap=True,
75
  interactive=False,
76
  type="pandas",
77
+ datatype="markdown",
78
  label="Leaderboard",
79
  elem_id="leaderboard-table",
80
  )
81
+
82
  status_text = gr.Markdown()
83
  refresh_button = gr.Button("Refresh leaderboard", variant="primary")
84
 
src/display/css_html_js.py CHANGED
@@ -33,7 +33,7 @@ custom_css = """
33
  background: none;
34
  border: none;
35
  }
36
-
37
  #search-bar {
38
  padding: 0px;
39
  }
@@ -77,7 +77,7 @@ custom_css = """
77
  #filter_type label > .wrap{
78
  width: 103px;
79
  }
80
- #filter_type label > .wrap .wrap-inner{
81
  padding: 2px;
82
  }
83
  #filter_type label > .wrap .wrap-inner input{
@@ -94,6 +94,25 @@ custom_css = """
94
  #box-filter > .form{
95
  border: 0
96
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  """
98
 
99
  get_window_url_params = """
 
33
  background: none;
34
  border: none;
35
  }
36
+
37
  #search-bar {
38
  padding: 0px;
39
  }
 
77
  #filter_type label > .wrap{
78
  width: 103px;
79
  }
80
+ #filter_type label > .wrap .wrap-inner{
81
  padding: 2px;
82
  }
83
  #filter_type label > .wrap .wrap-inner input{
 
94
  #box-filter > .form{
95
  border: 0
96
  }
97
+
98
+ /* Support for HTML rendering in DataFrame cells */
99
+ #leaderboard-table table td {
100
+ white-space: normal !important;
101
+ }
102
+
103
+ #leaderboard-table table td div {
104
+ display: inline-block;
105
+ }
106
+
107
+ /* Ensure markdown links are clickable */
108
+ #leaderboard-table table td a {
109
+ color: #0066cc;
110
+ text-decoration: underline;
111
+ }
112
+
113
+ #leaderboard-table table td a:hover {
114
+ color: #004499;
115
+ }
116
  """
117
 
118
  get_window_url_params = """
src/display/formatting.py CHANGED
@@ -8,3 +8,9 @@ def styled_warning(warn):
8
 
9
  def styled_message(message):
10
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
 
 
 
 
 
 
8
 
9
  def styled_message(message):
10
  return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
11
+
12
+
13
+ def markdown_link(text: str | None, url: str | None) -> str:
14
+ if text is None or url is None:
15
+ return text
16
+ return f"[{text}]({url})"
src/leaderboard/output.py CHANGED
@@ -12,17 +12,28 @@ def format_output_df(df: pd.DataFrame) -> pd.DataFrame:
12
  # Create a new DataFrame with the display columns
13
  result_df = pd.DataFrame()
14
  result_df[DisplayColumns.EXPERIMENT_NAME] = df[RequiredInputColumns.EXPERIMENT_ID]
15
- result_df[DisplayColumns.AGENT] = df[RequiredInputColumns.AGENT]
 
 
 
 
16
  result_df[DisplayColumns.LLM_USED] = df[RequiredInputColumns.LLM_USED]
 
17
  result_df[DisplayColumns.NORMALIZED_SCORE] = (
18
  df[RequiredInputColumns.MEAN_NORMALIZED_SCORE].round(3).astype(str)
19
  + " ± "
20
  + df[RequiredInputColumns.STD_NORMALIZED_SCORE].round(3).astype(str)
21
  )
 
 
 
 
 
22
  result_df[DisplayColumns.ANY_MEDAL_SCORE] = (
23
  (df[RequiredInputColumns.MEAN_MEDAL_PCT] * 100).round(1).astype(str)
24
  + " ± "
25
  + (df[RequiredInputColumns.SEM_MEDAL_PCT] * 100).round(1).astype(str)
26
  )
 
27
  result_df[DisplayColumns.DATE] = df[RequiredInputColumns.DATE]
28
- return result_df[DisplayColumns.values()]
 
12
  # Create a new DataFrame with the display columns
13
  result_df = pd.DataFrame()
14
  result_df[DisplayColumns.EXPERIMENT_NAME] = df[RequiredInputColumns.EXPERIMENT_ID]
15
+
16
+ # Format Agent column as Markdown (ensure it's displayed properly)
17
+ result_df[DisplayColumns.AGENT] = df[RequiredInputColumns.AGENT].astype(str)
18
+
19
+ # Format LLM(s) used with HuggingFace links
20
  result_df[DisplayColumns.LLM_USED] = df[RequiredInputColumns.LLM_USED]
21
+
22
  result_df[DisplayColumns.NORMALIZED_SCORE] = (
23
  df[RequiredInputColumns.MEAN_NORMALIZED_SCORE].round(3).astype(str)
24
  + " ± "
25
  + df[RequiredInputColumns.STD_NORMALIZED_SCORE].round(3).astype(str)
26
  )
27
+
28
+ # Keep the numeric mean_normalized_score for gradient calculation
29
+ result_df[RequiredInputColumns.MEAN_NORMALIZED_SCORE] = df[RequiredInputColumns.MEAN_NORMALIZED_SCORE]
30
+ result_df[RequiredInputColumns.MEAN_MEDAL_PCT] = df[RequiredInputColumns.MEAN_MEDAL_PCT]
31
+
32
  result_df[DisplayColumns.ANY_MEDAL_SCORE] = (
33
  (df[RequiredInputColumns.MEAN_MEDAL_PCT] * 100).round(1).astype(str)
34
  + " ± "
35
  + (df[RequiredInputColumns.SEM_MEDAL_PCT] * 100).round(1).astype(str)
36
  )
37
+
38
  result_df[DisplayColumns.DATE] = df[RequiredInputColumns.DATE]
39
+ return result_df
tests/test_leaderboard.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
4
  import pytest
5
  import requests
6
 
7
- from app import download_leaderboard, refresh_leaderboard
8
  from src.leaderboard.columns import DisplayColumns
9
 
10
 
@@ -49,7 +49,7 @@ class TestDownloadLeaderboard:
49
  mock_download.return_value = sample_csv_data
50
 
51
  # Execute
52
- df = download_leaderboard()
53
 
54
  # Assertions
55
  assert isinstance(df, pd.DataFrame)
@@ -62,7 +62,7 @@ class TestDownloadLeaderboard:
62
  """Test that numeric columns are properly formatted as mean ± std."""
63
  mock_download.return_value = sample_csv_data
64
 
65
- df = download_leaderboard()
66
 
67
  # Check that scores are formatted as strings with mean ± std
68
  # df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
@@ -77,7 +77,7 @@ class TestDownloadLeaderboard:
77
  """Test that medal percentages are converted from decimal to percentage and formatted."""
78
  mock_download.return_value = sample_csv_data
79
 
80
- df = download_leaderboard()
81
 
82
  # Check percentage conversion and formatting (0.876543 * 100 = 87.6543, rounded to 87.7)
83
  # df is sorted by score descending: exp_003 (92.3), exp_001 (87.7), exp_002 (76.5)
@@ -90,7 +90,7 @@ class TestDownloadLeaderboard:
90
  """Test that dates are properly formatted."""
91
  mock_download.return_value = sample_csv_data
92
 
93
- df = download_leaderboard()
94
 
95
  # Check date formatting - df sorted by score descending
96
  # exp_003 (2024-02-01), exp_001 (2024-01-15), exp_002 (2024-01-20)
@@ -103,7 +103,7 @@ class TestDownloadLeaderboard:
103
  """Test that df is sorted by mean_normalized_score descending."""
104
  mock_download.return_value = sample_csv_data
105
 
106
- df = download_leaderboard()
107
 
108
  # Check sorting (highest score first)
109
  # Extract numeric scores from formatted strings for comparison
@@ -117,7 +117,7 @@ class TestDownloadLeaderboard:
117
  """Test that extra columns are filtered out."""
118
  mock_download.return_value = sample_csv_with_extra_columns
119
 
120
- df = download_leaderboard()
121
 
122
  # Check that df is created correctly (extra columns should be filtered)
123
  assert len(df) == 2
@@ -131,7 +131,7 @@ class TestDownloadLeaderboard:
131
  mock_download.return_value = sample_csv_missing_columns
132
 
133
  with pytest.raises(ValueError, match="Leaderboard is missing expected columns"):
134
- download_leaderboard()
135
 
136
  @patch("src.leaderboard.input.download_github_file_content")
137
  def test_http_error(self, mock_download):
@@ -139,7 +139,7 @@ class TestDownloadLeaderboard:
139
  mock_download.side_effect = requests.HTTPError("404 Not Found")
140
 
141
  with pytest.raises(requests.HTTPError):
142
- download_leaderboard()
143
 
144
  @patch("src.leaderboard.input.download_github_file_content")
145
  def test_network_error(self, mock_download):
@@ -147,7 +147,7 @@ class TestDownloadLeaderboard:
147
  mock_download.side_effect = requests.ConnectionError("Connection failed")
148
 
149
  with pytest.raises(requests.ConnectionError):
150
- download_leaderboard()
151
 
152
  @patch("src.leaderboard.input.download_github_file_content")
153
  def test_timeout_handling(self, mock_download):
@@ -159,7 +159,7 @@ class TestDownloadLeaderboard:
159
  )
160
  mock_download.return_value = csv_data
161
 
162
- download_leaderboard()
163
 
164
  # Verify timeout was passed to download_github_file_content
165
  mock_download.assert_called_once()
@@ -176,7 +176,7 @@ class TestDownloadLeaderboard:
176
  )
177
  mock_download.return_value = csv_data
178
 
179
- df = download_leaderboard()
180
 
181
  assert isinstance(df, pd.DataFrame)
182
  assert len(df) == 0
@@ -193,7 +193,7 @@ class TestDownloadLeaderboard:
193
  )
194
  mock_download.return_value = csv_with_invalid_date
195
 
196
- df = download_leaderboard()
197
 
198
  # Invalid dates should become NaT and then "nan" string
199
  # Find rows by Experiment Name since order may vary
@@ -208,7 +208,7 @@ class TestDownloadLeaderboard:
208
  # The utility function handles LFS internally, so we just return the content
209
  mock_download.return_value = sample_csv_data
210
 
211
- df = download_leaderboard()
212
 
213
  # Should successfully download via download_url
214
  assert isinstance(df, pd.DataFrame)
@@ -222,7 +222,7 @@ class TestDownloadLeaderboard:
222
  # The utility function handles download_url internally, so we just return the content
223
  mock_download.return_value = sample_csv_data
224
 
225
- df = download_leaderboard()
226
 
227
  assert isinstance(df, pd.DataFrame)
228
  assert len(df) == 3
 
4
  import pytest
5
  import requests
6
 
7
+ from app import load_leaderboard, refresh_leaderboard
8
  from src.leaderboard.columns import DisplayColumns
9
 
10
 
 
49
  mock_download.return_value = sample_csv_data
50
 
51
  # Execute
52
+ df = load_leaderboard()
53
 
54
  # Assertions
55
  assert isinstance(df, pd.DataFrame)
 
62
  """Test that numeric columns are properly formatted as mean ± std."""
63
  mock_download.return_value = sample_csv_data
64
 
65
+ df = load_leaderboard()
66
 
67
  # Check that scores are formatted as strings with mean ± std
68
  # df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
 
77
  """Test that medal percentages are converted from decimal to percentage and formatted."""
78
  mock_download.return_value = sample_csv_data
79
 
80
+ df = load_leaderboard()
81
 
82
  # Check percentage conversion and formatting (0.876543 * 100 = 87.6543, rounded to 87.7)
83
  # df is sorted by score descending: exp_003 (92.3), exp_001 (87.7), exp_002 (76.5)
 
90
  """Test that dates are properly formatted."""
91
  mock_download.return_value = sample_csv_data
92
 
93
+ df = load_leaderboard()
94
 
95
  # Check date formatting - df sorted by score descending
96
  # exp_003 (2024-02-01), exp_001 (2024-01-15), exp_002 (2024-01-20)
 
103
  """Test that df is sorted by mean_normalized_score descending."""
104
  mock_download.return_value = sample_csv_data
105
 
106
+ df = load_leaderboard()
107
 
108
  # Check sorting (highest score first)
109
  # Extract numeric scores from formatted strings for comparison
 
117
  """Test that extra columns are filtered out."""
118
  mock_download.return_value = sample_csv_with_extra_columns
119
 
120
+ df = load_leaderboard()
121
 
122
  # Check that df is created correctly (extra columns should be filtered)
123
  assert len(df) == 2
 
131
  mock_download.return_value = sample_csv_missing_columns
132
 
133
  with pytest.raises(ValueError, match="Leaderboard is missing expected columns"):
134
+ load_leaderboard()
135
 
136
  @patch("src.leaderboard.input.download_github_file_content")
137
  def test_http_error(self, mock_download):
 
139
  mock_download.side_effect = requests.HTTPError("404 Not Found")
140
 
141
  with pytest.raises(requests.HTTPError):
142
+ load_leaderboard()
143
 
144
  @patch("src.leaderboard.input.download_github_file_content")
145
  def test_network_error(self, mock_download):
 
147
  mock_download.side_effect = requests.ConnectionError("Connection failed")
148
 
149
  with pytest.raises(requests.ConnectionError):
150
+ load_leaderboard()
151
 
152
  @patch("src.leaderboard.input.download_github_file_content")
153
  def test_timeout_handling(self, mock_download):
 
159
  )
160
  mock_download.return_value = csv_data
161
 
162
+ load_leaderboard()
163
 
164
  # Verify timeout was passed to download_github_file_content
165
  mock_download.assert_called_once()
 
176
  )
177
  mock_download.return_value = csv_data
178
 
179
+ df = load_leaderboard()
180
 
181
  assert isinstance(df, pd.DataFrame)
182
  assert len(df) == 0
 
193
  )
194
  mock_download.return_value = csv_with_invalid_date
195
 
196
+ df = load_leaderboard()
197
 
198
  # Invalid dates should become NaT and then "nan" string
199
  # Find rows by Experiment Name since order may vary
 
208
  # The utility function handles LFS internally, so we just return the content
209
  mock_download.return_value = sample_csv_data
210
 
211
+ df = load_leaderboard()
212
 
213
  # Should successfully download via download_url
214
  assert isinstance(df, pd.DataFrame)
 
222
  # The utility function handles download_url internally, so we just return the content
223
  mock_download.return_value = sample_csv_data
224
 
225
+ df = load_leaderboard()
226
 
227
  assert isinstance(df, pd.DataFrame)
228
  assert len(df) == 3