Files changed (3)
  1. app.py +3 -1
  2. src/leaderboard/columns.py +2 -2
  3. tests/test_leaderboard.py +16 -38
app.py CHANGED
@@ -43,6 +43,9 @@ def refresh_leaderboard():
43
  def apply_styling(df: pd.DataFrame):
44
  """Apply styling to the leaderboard table."""
45
 
 
 
 
46
  display_df = df[DisplayColumns.values()]
47
 
48
  style = (
@@ -78,7 +81,6 @@ def create_app():
78
  gr.HTML(TITLE)
79
  gr.Markdown(INTRODUCTION_TEXT)
80
 
81
- # style = apply_styling(load_leaderboard())
82
  leaderboard_table = gr.DataFrame(
83
  value=pd.DataFrame(columns=DisplayColumns.values()),
84
  wrap=True,
 
43
  def apply_styling(df: pd.DataFrame):
44
  """Apply styling to the leaderboard table."""
45
 
46
+ if df.empty:
47
+ return pd.DataFrame(columns=DisplayColumns.values())
48
+
49
  display_df = df[DisplayColumns.values()]
50
 
51
  style = (
 
81
  gr.HTML(TITLE)
82
  gr.Markdown(INTRODUCTION_TEXT)
83
 
 
84
  leaderboard_table = gr.DataFrame(
85
  value=pd.DataFrame(columns=DisplayColumns.values()),
86
  wrap=True,
src/leaderboard/columns.py CHANGED
@@ -26,8 +26,8 @@ class DisplayColumns:
26
  EXPERIMENT_NAME = "Experiment Name"
27
  AGENT = "Agent"
28
  LLM_USED = "LLM(s) used"
29
- NORMALIZED_SCORE = "Normalized Score"
30
- ANY_MEDAL_SCORE = "Any Medal % Score"
31
  DATE = "Date"
32
 
33
  @staticmethod
 
26
  EXPERIMENT_NAME = "Experiment Name"
27
  AGENT = "Agent"
28
  LLM_USED = "LLM(s) used"
29
+ NORMALIZED_SCORE = "Normalized Score / Quality"
30
+ ANY_MEDAL_SCORE = "Any Medal % / Autonomy"
31
  DATE = "Date"
32
 
33
  @staticmethod
tests/test_leaderboard.py CHANGED
@@ -5,7 +5,7 @@ import pytest
5
  import requests
6
 
7
  from app import load_leaderboard, refresh_leaderboard
8
- from src.leaderboard.columns import DisplayColumns
9
 
10
 
11
  @pytest.fixture
@@ -54,7 +54,8 @@ class TestDownloadLeaderboard:
54
  # Assertions
55
  assert isinstance(df, pd.DataFrame)
56
  assert len(df) == 3
57
- assert list(df.columns) == DisplayColumns.values()
 
58
  mock_download.assert_called_once()
59
 
60
  @patch("src.leaderboard.input.download_github_file_content")
@@ -66,11 +67,11 @@ class TestDownloadLeaderboard:
66
 
67
  # Check that scores are formatted as strings with mean ± std
68
  # df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
69
- assert df.iloc[0]["Normalized Score"] == "0.912 ± 0.009"
70
- assert df.iloc[1]["Normalized Score"] == "0.854 ± 0.012"
71
- assert df.iloc[2]["Normalized Score"] == "0.789 ± 0.023"
72
  # Check that scores are strings
73
- assert isinstance(df.iloc[0]["Normalized Score"], str)
74
 
75
  @patch("src.leaderboard.input.download_github_file_content")
76
  def test_percentage_conversion(self, mock_download, sample_csv_data):
@@ -121,7 +122,9 @@ class TestDownloadLeaderboard:
121
 
122
  # Check that df is created correctly (extra columns should be filtered)
123
  assert len(df) == 2
124
- assert list(df.columns) == DisplayColumns.values()
 
 
125
  # Verify the df doesn't have extra columns
126
  assert "extra_col" not in df.columns
127
 
@@ -202,38 +205,11 @@ class TestDownloadLeaderboard:
202
  assert pd.isna(row_001[DisplayColumns.DATE])
203
  assert row_002[DisplayColumns.DATE] == "2024-01-20"
204
 
205
- @patch("src.leaderboard.input.download_github_file_content")
206
- def test_git_lfs_pointer_file(self, mock_download, sample_csv_data):
207
- """Test handling of Git LFS pointer files."""
208
- # The utility function handles LFS internally, so we just return the content
209
- mock_download.return_value = sample_csv_data
210
-
211
- df = load_leaderboard()
212
-
213
- # Should successfully download via download_url
214
- assert isinstance(df, pd.DataFrame)
215
- assert len(df) == 3
216
- assert list(df.columns) == DisplayColumns.values()
217
- mock_download.assert_called_once()
218
-
219
- @patch("src.leaderboard.input.download_github_file_content")
220
- def test_large_file_download_url(self, mock_download, sample_csv_data):
221
- """Test handling of large files that only have download_url."""
222
- # The utility function handles download_url internally, so we just return the content
223
- mock_download.return_value = sample_csv_data
224
-
225
- df = load_leaderboard()
226
-
227
- assert isinstance(df, pd.DataFrame)
228
- assert len(df) == 3
229
- assert list(df.columns) == DisplayColumns.values()
230
- mock_download.assert_called_once()
231
-
232
 
233
  class TestRefreshLeaderboard:
234
  """Tests for refresh_leaderboard function."""
235
 
236
- @patch("app.download_leaderboard")
237
  def test_refresh_leaderboard_success(self, mock_download):
238
  """Test that refresh_leaderboard returns dataframe and status message."""
239
  # Setup mocks
@@ -242,7 +218,9 @@ class TestRefreshLeaderboard:
242
  DisplayColumns.EXPERIMENT_NAME: ["exp_001"],
243
  DisplayColumns.AGENT: ["Agent A"],
244
  DisplayColumns.LLM_USED: ["GPT-4"],
 
245
  DisplayColumns.NORMALIZED_SCORE: ["0.850 ± 0.010"],
 
246
  DisplayColumns.ANY_MEDAL_SCORE: ["85.0 ± 1.0"],
247
  DisplayColumns.DATE: ["2024-01-15"],
248
  }
@@ -253,7 +231,7 @@ class TestRefreshLeaderboard:
253
  df, status = refresh_leaderboard()
254
 
255
  # Assertions
256
- assert isinstance(df, pd.DataFrame)
257
  assert "Showing data from" in status
258
  assert "GitHub" in status
259
  # Check that status contains timestamp in expected format (YYYY-MM-DD HH:MM UTC)
@@ -266,7 +244,7 @@ class TestRefreshLeaderboard:
266
  assert re.search(timestamp_pattern, status) is not None
267
  mock_download.assert_called_once()
268
 
269
- @patch("app.download_leaderboard")
270
  def test_refresh_leaderboard_includes_url(self, mock_download):
271
  """Test that status message includes the GitHub URL."""
272
  mock_df = pd.DataFrame()
@@ -277,7 +255,7 @@ class TestRefreshLeaderboard:
277
  assert "github.com" in status.lower() or "GitHub" in status
278
  assert "upgini/mle-bench" in status
279
 
280
- @patch("app.download_leaderboard")
281
  def test_refresh_leaderboard_propagates_error(self, mock_download):
282
  """Test that errors from download_leaderboard are propagated."""
283
  mock_download.side_effect = requests.HTTPError("404 Not Found")
 
5
  import requests
6
 
7
  from app import load_leaderboard, refresh_leaderboard
8
+ from src.leaderboard.columns import DisplayColumns, RequiredInputColumns
9
 
10
 
11
  @pytest.fixture
 
54
  # Assertions
55
  assert isinstance(df, pd.DataFrame)
56
  assert len(df) == 3
57
+
58
+ assert all(col in df.columns for col in DisplayColumns.values())
59
  mock_download.assert_called_once()
60
 
61
  @patch("src.leaderboard.input.download_github_file_content")
 
67
 
68
  # Check that scores are formatted as strings with mean ± std
69
  # df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
70
+ assert df.iloc[0][DisplayColumns.NORMALIZED_SCORE] == "0.912 ± 0.009"
71
+ assert df.iloc[1][DisplayColumns.NORMALIZED_SCORE] == "0.854 ± 0.012"
72
+ assert df.iloc[2][DisplayColumns.NORMALIZED_SCORE] == "0.789 ± 0.023"
73
  # Check that scores are strings
74
+ assert isinstance(df.iloc[0][DisplayColumns.NORMALIZED_SCORE], str)
75
 
76
  @patch("src.leaderboard.input.download_github_file_content")
77
  def test_percentage_conversion(self, mock_download, sample_csv_data):
 
122
 
123
  # Check that df is created correctly (extra columns should be filtered)
124
  assert len(df) == 2
125
+ assert set(df.columns) == set(
126
+ DisplayColumns.values() + [RequiredInputColumns.MEAN_NORMALIZED_SCORE, RequiredInputColumns.MEAN_MEDAL_PCT]
127
+ )
128
  # Verify the df doesn't have extra columns
129
  assert "extra_col" not in df.columns
130
 
 
205
  assert pd.isna(row_001[DisplayColumns.DATE])
206
  assert row_002[DisplayColumns.DATE] == "2024-01-20"
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  class TestRefreshLeaderboard:
210
  """Tests for refresh_leaderboard function."""
211
 
212
+ @patch("app.load_leaderboard")
213
  def test_refresh_leaderboard_success(self, mock_download):
214
  """Test that refresh_leaderboard returns dataframe and status message."""
215
  # Setup mocks
 
218
  DisplayColumns.EXPERIMENT_NAME: ["exp_001"],
219
  DisplayColumns.AGENT: ["Agent A"],
220
  DisplayColumns.LLM_USED: ["GPT-4"],
221
+ RequiredInputColumns.MEAN_NORMALIZED_SCORE: [0.850],
222
  DisplayColumns.NORMALIZED_SCORE: ["0.850 ± 0.010"],
223
+ RequiredInputColumns.MEAN_MEDAL_PCT: [0.850],
224
  DisplayColumns.ANY_MEDAL_SCORE: ["85.0 ± 1.0"],
225
  DisplayColumns.DATE: ["2024-01-15"],
226
  }
 
231
  df, status = refresh_leaderboard()
232
 
233
  # Assertions
234
+ assert df is not None
235
  assert "Showing data from" in status
236
  assert "GitHub" in status
237
  # Check that status contains timestamp in expected format (YYYY-MM-DD HH:MM UTC)
 
244
  assert re.search(timestamp_pattern, status) is not None
245
  mock_download.assert_called_once()
246
 
247
+ @patch("app.load_leaderboard")
248
  def test_refresh_leaderboard_includes_url(self, mock_download):
249
  """Test that status message includes the GitHub URL."""
250
  mock_df = pd.DataFrame()
 
255
  assert "github.com" in status.lower() or "GitHub" in status
256
  assert "upgini/mle-bench" in status
257
 
258
+ @patch("app.load_leaderboard")
259
  def test_refresh_leaderboard_propagates_error(self, mock_download):
260
  """Test that errors from load_leaderboard are propagated."""
261
  mock_download.side_effect = requests.HTTPError("404 Not Found")