Spaces:
Running
Running
Update table view
#2
by Sunmarinup - opened
- app.py +3 -1
- src/leaderboard/columns.py +2 -2
- tests/test_leaderboard.py +16 -38
app.py
CHANGED
|
@@ -43,6 +43,9 @@ def refresh_leaderboard():
|
|
| 43 |
def apply_styling(df: pd.DataFrame):
|
| 44 |
"""Apply styling to the leaderboard table."""
|
| 45 |
|
|
|
|
|
|
|
|
|
|
| 46 |
display_df = df[DisplayColumns.values()]
|
| 47 |
|
| 48 |
style = (
|
|
@@ -78,7 +81,6 @@ def create_app():
|
|
| 78 |
gr.HTML(TITLE)
|
| 79 |
gr.Markdown(INTRODUCTION_TEXT)
|
| 80 |
|
| 81 |
-
# style = apply_styling(load_leaderboard())
|
| 82 |
leaderboard_table = gr.DataFrame(
|
| 83 |
value=pd.DataFrame(columns=DisplayColumns.values()),
|
| 84 |
wrap=True,
|
|
|
|
| 43 |
def apply_styling(df: pd.DataFrame):
|
| 44 |
"""Apply styling to the leaderboard table."""
|
| 45 |
|
| 46 |
+
if df.empty:
|
| 47 |
+
return pd.DataFrame(columns=DisplayColumns.values())
|
| 48 |
+
|
| 49 |
display_df = df[DisplayColumns.values()]
|
| 50 |
|
| 51 |
style = (
|
|
|
|
| 81 |
gr.HTML(TITLE)
|
| 82 |
gr.Markdown(INTRODUCTION_TEXT)
|
| 83 |
|
|
|
|
| 84 |
leaderboard_table = gr.DataFrame(
|
| 85 |
value=pd.DataFrame(columns=DisplayColumns.values()),
|
| 86 |
wrap=True,
|
src/leaderboard/columns.py
CHANGED
|
@@ -26,8 +26,8 @@ class DisplayColumns:
|
|
| 26 |
EXPERIMENT_NAME = "Experiment Name"
|
| 27 |
AGENT = "Agent"
|
| 28 |
LLM_USED = "LLM(s) used"
|
| 29 |
-
NORMALIZED_SCORE = "Normalized Score"
|
| 30 |
-
ANY_MEDAL_SCORE = "Any Medal %
|
| 31 |
DATE = "Date"
|
| 32 |
|
| 33 |
@staticmethod
|
|
|
|
| 26 |
EXPERIMENT_NAME = "Experiment Name"
|
| 27 |
AGENT = "Agent"
|
| 28 |
LLM_USED = "LLM(s) used"
|
| 29 |
+
NORMALIZED_SCORE = "Normalized Score / Quality"
|
| 30 |
+
ANY_MEDAL_SCORE = "Any Medal % / Autonomy"
|
| 31 |
DATE = "Date"
|
| 32 |
|
| 33 |
@staticmethod
|
tests/test_leaderboard.py
CHANGED
|
@@ -5,7 +5,7 @@ import pytest
|
|
| 5 |
import requests
|
| 6 |
|
| 7 |
from app import load_leaderboard, refresh_leaderboard
|
| 8 |
-
from src.leaderboard.columns import DisplayColumns
|
| 9 |
|
| 10 |
|
| 11 |
@pytest.fixture
|
|
@@ -54,7 +54,8 @@ class TestDownloadLeaderboard:
|
|
| 54 |
# Assertions
|
| 55 |
assert isinstance(df, pd.DataFrame)
|
| 56 |
assert len(df) == 3
|
| 57 |
-
|
|
|
|
| 58 |
mock_download.assert_called_once()
|
| 59 |
|
| 60 |
@patch("src.leaderboard.input.download_github_file_content")
|
|
@@ -66,11 +67,11 @@ class TestDownloadLeaderboard:
|
|
| 66 |
|
| 67 |
# Check that scores are formatted as strings with mean ± std
|
| 68 |
# df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
|
| 69 |
-
assert df.iloc[0][
|
| 70 |
-
assert df.iloc[1][
|
| 71 |
-
assert df.iloc[2][
|
| 72 |
# Check that scores are strings
|
| 73 |
-
assert isinstance(df.iloc[0][
|
| 74 |
|
| 75 |
@patch("src.leaderboard.input.download_github_file_content")
|
| 76 |
def test_percentage_conversion(self, mock_download, sample_csv_data):
|
|
@@ -121,7 +122,9 @@ class TestDownloadLeaderboard:
|
|
| 121 |
|
| 122 |
# Check that df is created correctly (extra columns should be filtered)
|
| 123 |
assert len(df) == 2
|
| 124 |
-
assert
|
|
|
|
|
|
|
| 125 |
# Verify the df doesn't have extra columns
|
| 126 |
assert "extra_col" not in df.columns
|
| 127 |
|
|
@@ -202,38 +205,11 @@ class TestDownloadLeaderboard:
|
|
| 202 |
assert pd.isna(row_001[DisplayColumns.DATE])
|
| 203 |
assert row_002[DisplayColumns.DATE] == "2024-01-20"
|
| 204 |
|
| 205 |
-
@patch("src.leaderboard.input.download_github_file_content")
|
| 206 |
-
def test_git_lfs_pointer_file(self, mock_download, sample_csv_data):
|
| 207 |
-
"""Test handling of Git LFS pointer files."""
|
| 208 |
-
# The utility function handles LFS internally, so we just return the content
|
| 209 |
-
mock_download.return_value = sample_csv_data
|
| 210 |
-
|
| 211 |
-
df = load_leaderboard()
|
| 212 |
-
|
| 213 |
-
# Should successfully download via download_url
|
| 214 |
-
assert isinstance(df, pd.DataFrame)
|
| 215 |
-
assert len(df) == 3
|
| 216 |
-
assert list(df.columns) == DisplayColumns.values()
|
| 217 |
-
mock_download.assert_called_once()
|
| 218 |
-
|
| 219 |
-
@patch("src.leaderboard.input.download_github_file_content")
|
| 220 |
-
def test_large_file_download_url(self, mock_download, sample_csv_data):
|
| 221 |
-
"""Test handling of large files that only have download_url."""
|
| 222 |
-
# The utility function handles download_url internally, so we just return the content
|
| 223 |
-
mock_download.return_value = sample_csv_data
|
| 224 |
-
|
| 225 |
-
df = load_leaderboard()
|
| 226 |
-
|
| 227 |
-
assert isinstance(df, pd.DataFrame)
|
| 228 |
-
assert len(df) == 3
|
| 229 |
-
assert list(df.columns) == DisplayColumns.values()
|
| 230 |
-
mock_download.assert_called_once()
|
| 231 |
-
|
| 232 |
|
| 233 |
class TestRefreshLeaderboard:
|
| 234 |
"""Tests for refresh_leaderboard function."""
|
| 235 |
|
| 236 |
-
@patch("app.
|
| 237 |
def test_refresh_leaderboard_success(self, mock_download):
|
| 238 |
"""Test that refresh_leaderboard returns dataframe and status message."""
|
| 239 |
# Setup mocks
|
|
@@ -242,7 +218,9 @@ class TestRefreshLeaderboard:
|
|
| 242 |
DisplayColumns.EXPERIMENT_NAME: ["exp_001"],
|
| 243 |
DisplayColumns.AGENT: ["Agent A"],
|
| 244 |
DisplayColumns.LLM_USED: ["GPT-4"],
|
|
|
|
| 245 |
DisplayColumns.NORMALIZED_SCORE: ["0.850 ± 0.010"],
|
|
|
|
| 246 |
DisplayColumns.ANY_MEDAL_SCORE: ["85.0 ± 1.0"],
|
| 247 |
DisplayColumns.DATE: ["2024-01-15"],
|
| 248 |
}
|
|
@@ -253,7 +231,7 @@ class TestRefreshLeaderboard:
|
|
| 253 |
df, status = refresh_leaderboard()
|
| 254 |
|
| 255 |
# Assertions
|
| 256 |
-
assert
|
| 257 |
assert "Showing data from" in status
|
| 258 |
assert "GitHub" in status
|
| 259 |
# Check that status contains timestamp in expected format (YYYY-MM-DD HH:MM UTC)
|
|
@@ -266,7 +244,7 @@ class TestRefreshLeaderboard:
|
|
| 266 |
assert re.search(timestamp_pattern, status) is not None
|
| 267 |
mock_download.assert_called_once()
|
| 268 |
|
| 269 |
-
@patch("app.
|
| 270 |
def test_refresh_leaderboard_includes_url(self, mock_download):
|
| 271 |
"""Test that status message includes the GitHub URL."""
|
| 272 |
mock_df = pd.DataFrame()
|
|
@@ -277,7 +255,7 @@ class TestRefreshLeaderboard:
|
|
| 277 |
assert "github.com" in status.lower() or "GitHub" in status
|
| 278 |
assert "upgini/mle-bench" in status
|
| 279 |
|
| 280 |
-
@patch("app.
|
| 281 |
def test_refresh_leaderboard_propagates_error(self, mock_download):
|
| 282 |
"""Test that errors from download_leaderboard are propagated."""
|
| 283 |
mock_download.side_effect = requests.HTTPError("404 Not Found")
|
|
|
|
| 5 |
import requests
|
| 6 |
|
| 7 |
from app import load_leaderboard, refresh_leaderboard
|
| 8 |
+
from src.leaderboard.columns import DisplayColumns, RequiredInputColumns
|
| 9 |
|
| 10 |
|
| 11 |
@pytest.fixture
|
|
|
|
| 54 |
# Assertions
|
| 55 |
assert isinstance(df, pd.DataFrame)
|
| 56 |
assert len(df) == 3
|
| 57 |
+
|
| 58 |
+
assert all(col in df.columns for col in DisplayColumns.values())
|
| 59 |
mock_download.assert_called_once()
|
| 60 |
|
| 61 |
@patch("src.leaderboard.input.download_github_file_content")
|
|
|
|
| 67 |
|
| 68 |
# Check that scores are formatted as strings with mean ± std
|
| 69 |
# df is sorted by score descending: exp_003 (0.912), exp_001 (0.854), exp_002 (0.789)
|
| 70 |
+
assert df.iloc[0][DisplayColumns.NORMALIZED_SCORE] == "0.912 ± 0.009"
|
| 71 |
+
assert df.iloc[1][DisplayColumns.NORMALIZED_SCORE] == "0.854 ± 0.012"
|
| 72 |
+
assert df.iloc[2][DisplayColumns.NORMALIZED_SCORE] == "0.789 ± 0.023"
|
| 73 |
# Check that scores are strings
|
| 74 |
+
assert isinstance(df.iloc[0][DisplayColumns.NORMALIZED_SCORE], str)
|
| 75 |
|
| 76 |
@patch("src.leaderboard.input.download_github_file_content")
|
| 77 |
def test_percentage_conversion(self, mock_download, sample_csv_data):
|
|
|
|
| 122 |
|
| 123 |
# Check that df is created correctly (extra columns should be filtered)
|
| 124 |
assert len(df) == 2
|
| 125 |
+
assert set(df.columns) == set(
|
| 126 |
+
DisplayColumns.values() + [RequiredInputColumns.MEAN_NORMALIZED_SCORE, RequiredInputColumns.MEAN_MEDAL_PCT]
|
| 127 |
+
)
|
| 128 |
# Verify the df doesn't have extra columns
|
| 129 |
assert "extra_col" not in df.columns
|
| 130 |
|
|
|
|
| 205 |
assert pd.isna(row_001[DisplayColumns.DATE])
|
| 206 |
assert row_002[DisplayColumns.DATE] == "2024-01-20"
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
class TestRefreshLeaderboard:
|
| 210 |
"""Tests for refresh_leaderboard function."""
|
| 211 |
|
| 212 |
+
@patch("app.load_leaderboard")
|
| 213 |
def test_refresh_leaderboard_success(self, mock_download):
|
| 214 |
"""Test that refresh_leaderboard returns dataframe and status message."""
|
| 215 |
# Setup mocks
|
|
|
|
| 218 |
DisplayColumns.EXPERIMENT_NAME: ["exp_001"],
|
| 219 |
DisplayColumns.AGENT: ["Agent A"],
|
| 220 |
DisplayColumns.LLM_USED: ["GPT-4"],
|
| 221 |
+
RequiredInputColumns.MEAN_NORMALIZED_SCORE: [0.850],
|
| 222 |
DisplayColumns.NORMALIZED_SCORE: ["0.850 ± 0.010"],
|
| 223 |
+
RequiredInputColumns.MEAN_MEDAL_PCT: [0.850],
|
| 224 |
DisplayColumns.ANY_MEDAL_SCORE: ["85.0 ± 1.0"],
|
| 225 |
DisplayColumns.DATE: ["2024-01-15"],
|
| 226 |
}
|
|
|
|
| 231 |
df, status = refresh_leaderboard()
|
| 232 |
|
| 233 |
# Assertions
|
| 234 |
+
assert df is not None
|
| 235 |
assert "Showing data from" in status
|
| 236 |
assert "GitHub" in status
|
| 237 |
# Check that status contains timestamp in expected format (YYYY-MM-DD HH:MM UTC)
|
|
|
|
| 244 |
assert re.search(timestamp_pattern, status) is not None
|
| 245 |
mock_download.assert_called_once()
|
| 246 |
|
| 247 |
+
@patch("app.load_leaderboard")
|
| 248 |
def test_refresh_leaderboard_includes_url(self, mock_download):
|
| 249 |
"""Test that status message includes the GitHub URL."""
|
| 250 |
mock_df = pd.DataFrame()
|
|
|
|
| 255 |
assert "github.com" in status.lower() or "GitHub" in status
|
| 256 |
assert "upgini/mle-bench" in status
|
| 257 |
|
| 258 |
+
@patch("app.load_leaderboard")
|
| 259 |
def test_refresh_leaderboard_propagates_error(self, mock_download):
|
| 260 |
"""Test that errors from download_leaderboard are propagated."""
|
| 261 |
mock_download.side_effect = requests.HTTPError("404 Not Found")
|