sunmarinup committed on
Commit
ab2d497
·
1 Parent(s): 1ecfc37

Account for git LFS

Browse files
Files changed (6) hide show
  1. Makefile +3 -0
  2. app.py +39 -6
  3. pyproject.toml +7 -0
  4. requirements.txt +1 -0
  5. tests/__init__.py +0 -0
  6. tests/test_leaderboard.py +321 -0
Makefile CHANGED
@@ -11,3 +11,6 @@ quality:
11
  python -m black --check --line-length 119 .
12
  python -m isort --check-only .
13
  ruff check .
 
 
 
 
11
  python -m black --check --line-length 119 .
12
  python -m isort --check-only .
13
  ruff check .
14
+
15
+ test:
16
+ pytest
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import io
2
  from datetime import datetime, timezone
3
 
@@ -5,7 +6,9 @@ import gradio as gr
5
  import pandas as pd
6
  import requests
7
 
8
- LEADERBOARD_URL = "https://raw.githubusercontent.com/upgini/mle-bench/main/" "rankings/low/tabular/overall_ranks.csv"
 
 
9
 
10
  DISPLAY_COLUMNS = [
11
  "experiment_id",
@@ -20,11 +23,41 @@ DISPLAY_COLUMNS = [
20
 
21
 
22
  def download_leaderboard() -> pd.DataFrame:
23
- """Download the remote leaderboard CSV and return a cleaned dataframe."""
24
- response = requests.get(LEADERBOARD_URL, timeout=30)
 
25
  response.raise_for_status()
26
 
27
- df = pd.read_csv(io.StringIO(response.text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  missing_cols = [col for col in DISPLAY_COLUMNS if col not in df.columns]
29
  if missing_cols:
30
  raise ValueError(f"Leaderboard is missing expected columns: {', '.join(missing_cols)}")
@@ -42,7 +75,7 @@ def refresh_leaderboard():
42
  """Fetch the leaderboard and build the status message for the UI."""
43
  df = download_leaderboard()
44
  status = (
45
- f"Showing data from [GitHub]({LEADERBOARD_URL}). "
46
  f"Last refreshed: {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}."
47
  )
48
  return df, status
@@ -71,4 +104,4 @@ with gr.Blocks(title="Upgini MLE-Bench Leaderboard") as demo:
71
  demo.load(refresh_leaderboard, outputs=[leaderboard_table, status_text])
72
  refresh_button.click(refresh_leaderboard, outputs=[leaderboard_table, status_text])
73
 
74
- demo.queue(concurrency_count=8).launch()
 
1
+ import base64
2
  import io
3
  from datetime import datetime, timezone
4
 
 
6
  import pandas as pd
7
  import requests
8
 
9
+ # GitHub API endpoint for the file (handles Git LFS files)
10
+ LEADERBOARD_API_URL = "https://api.github.com/repos/upgini/mle-bench/contents/rankings/low/tabular/overall_ranks.csv"
11
+ LEADERBOARD_GITHUB_URL = "https://github.com/upgini/mle-bench/blob/main/rankings/low/tabular/overall_ranks.csv"
12
 
13
  DISPLAY_COLUMNS = [
14
  "experiment_id",
 
23
 
24
 
25
  def download_leaderboard() -> pd.DataFrame:
26
+ """Download the remote leaderboard CSV from GitHub (handles Git LFS) and return a cleaned dataframe."""
27
+ # Use GitHub API to get file content (handles Git LFS files)
28
+ response = requests.get(LEADERBOARD_API_URL, timeout=30)
29
  response.raise_for_status()
30
 
31
+ api_data = response.json()
32
+
33
+ # Get file content - GitHub API handles Git LFS files
34
+ # If content is in the response, decode it; otherwise use download_url
35
+ if "content" in api_data:
36
+ # Decode base64 content
37
+ try:
38
+ csv_content = base64.b64decode(api_data["content"]).decode("utf-8")
39
+ except Exception as e:
40
+ raise ValueError(f"Failed to decode file content: {e}")
41
+
42
+ # Check if it's a Git LFS pointer file
43
+ if csv_content.startswith("version https://git-lfs.github.com/spec/v1"):
44
+ # For LFS files, use the download_url which points to the actual file
45
+ download_url = api_data.get("download_url")
46
+ if not download_url:
47
+ raise ValueError("Git LFS file found but no download_url available")
48
+ # Download the actual file content
49
+ lfs_response = requests.get(download_url, timeout=30)
50
+ lfs_response.raise_for_status()
51
+ csv_content = lfs_response.text
52
+ elif "download_url" in api_data:
53
+ # Large files don't include content, use download_url directly
54
+ download_response = requests.get(api_data["download_url"], timeout=30)
55
+ download_response.raise_for_status()
56
+ csv_content = download_response.text
57
+ else:
58
+ raise ValueError("No content or download_url found in API response")
59
+
60
+ df = pd.read_csv(io.StringIO(csv_content))
61
  missing_cols = [col for col in DISPLAY_COLUMNS if col not in df.columns]
62
  if missing_cols:
63
  raise ValueError(f"Leaderboard is missing expected columns: {', '.join(missing_cols)}")
 
75
  """Fetch the leaderboard and build the status message for the UI."""
76
  df = download_leaderboard()
77
  status = (
78
+ f"Showing data from [GitHub]({LEADERBOARD_GITHUB_URL}). "
79
  f"Last refreshed: {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}."
80
  )
81
  return df, status
 
104
  demo.load(refresh_leaderboard, outputs=[leaderboard_table, status_text])
105
  refresh_button.click(refresh_leaderboard, outputs=[leaderboard_table, status_text])
106
 
107
+ demo.queue(default_concurrency_limit=8).launch()
pyproject.toml CHANGED
@@ -11,3 +11,10 @@ line_length = 119
11
 
12
  [tool.black]
13
  line-length = 119
 
 
 
 
 
 
 
 
11
 
12
  [tool.black]
13
  line-length = 119
14
+
15
+ [tool.pytest.ini_options]
16
+ testpaths = ["tests"]
17
+ python_files = ["test_*.py"]
18
+ python_classes = ["Test*"]
19
+ python_functions = ["test_*"]
20
+ addopts = "-v"
requirements.txt CHANGED
@@ -9,6 +9,7 @@ huggingface-hub>=0.18.0
9
  matplotlib
10
  numpy
11
  pandas
 
12
  requests
13
  python-dateutil
14
  tqdm
 
9
  matplotlib
10
  numpy
11
  pandas
12
+ pytest
13
  requests
14
  python-dateutil
15
  tqdm
tests/__init__.py ADDED
File without changes
tests/test_leaderboard.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for leaderboard functionality."""
2
+
3
+ import base64
4
+ from unittest.mock import Mock, patch
5
+
6
+ import pandas as pd
7
+ import pytest
8
+ import requests
9
+
10
+ from app import DISPLAY_COLUMNS, download_leaderboard, refresh_leaderboard
11
+
12
+
13
+ def create_github_api_response(csv_content, is_lfs_pointer=False, use_download_url=False):
14
+ """Helper to create a mock GitHub API response."""
15
+ api_response = Mock()
16
+ api_response.raise_for_status = Mock()
17
+
18
+ if use_download_url:
19
+ # For large files, API doesn't include content, only download_url
20
+ api_response.json.return_value = {
21
+ "download_url": "https://github.com/test/file.csv",
22
+ "sha": "test_sha",
23
+ }
24
+ # Second call for download_url
25
+ download_response = Mock()
26
+ download_response.text = csv_content
27
+ download_response.raise_for_status = Mock()
28
+ return [api_response, download_response]
29
+ elif is_lfs_pointer:
30
+ # Create a Git LFS pointer file
31
+ lfs_pointer = "version https://git-lfs.github.com/spec/v1\noid sha256:test123\nsize 100"
32
+ encoded_content = base64.b64encode(lfs_pointer.encode("utf-8")).decode("utf-8")
33
+ api_response.json.return_value = {
34
+ "content": encoded_content,
35
+ "download_url": "https://github.com/test/file.csv",
36
+ "sha": "test_sha",
37
+ }
38
+ # Second call for download_url (LFS files need to be downloaded)
39
+ download_response = Mock()
40
+ download_response.text = csv_content
41
+ download_response.raise_for_status = Mock()
42
+ return [api_response, download_response]
43
+ else:
44
+ encoded_content = base64.b64encode(csv_content.encode("utf-8")).decode("utf-8")
45
+ api_response.json.return_value = {
46
+ "content": encoded_content,
47
+ "download_url": "https://github.com/test/file.csv",
48
+ "sha": "test_sha",
49
+ }
50
+ return [api_response]
51
+
52
+
53
+ @pytest.fixture
54
+ def sample_csv_data():
55
+ """Sample CSV data matching the expected leaderboard format."""
56
+ return """experiment_id,mean_normalized_score,std_normalized_score,mean_medal_pct,sem_medal_pct,Agent,LLM(s) used,Date
57
+ exp_001,0.854321,0.012345,0.876543,0.009876,Agent A,GPT-4,2024-01-15
58
+ exp_002,0.789012,0.023456,0.765432,0.012345,Agent B,Claude-3,2024-01-20
59
+ exp_003,0.912345,0.008765,0.923456,0.007654,Agent C,GPT-4,2024-02-01"""
60
+
61
+
62
+ @pytest.fixture
63
+ def sample_csv_with_extra_columns():
64
+ """Sample CSV with extra columns that should be filtered out."""
65
+ return """experiment_id,mean_normalized_score,std_normalized_score,mean_medal_pct,sem_medal_pct,Agent,LLM(s) used,Date,extra_col
66
+ exp_001,0.854321,0.012345,0.876543,0.009876,Agent A,GPT-4,2024-01-15,extra_value
67
+ exp_002,0.789012,0.023456,0.765432,0.012345,Agent B,Claude-3,2024-01-20,extra_value"""
68
+
69
+
70
+ @pytest.fixture
71
+ def sample_csv_missing_columns():
72
+ """Sample CSV missing required columns."""
73
+ return """experiment_id,mean_normalized_score,Agent
74
+ exp_001,0.854321,Agent A
75
+ exp_002,0.789012,Agent B"""
76
+
77
+
78
+ class TestDownloadLeaderboard:
79
+ """Tests for download_leaderboard function."""
80
+
81
+ @patch("app.requests.get")
82
+ def test_successful_download(self, mock_get, sample_csv_data):
83
+ """Test successful download and parsing of leaderboard."""
84
+ # Setup mock GitHub API response
85
+ mock_responses = create_github_api_response(sample_csv_data)
86
+ mock_get.side_effect = mock_responses
87
+
88
+ # Execute
89
+ df = download_leaderboard()
90
+
91
+ # Assertions
92
+ assert isinstance(df, pd.DataFrame)
93
+ assert len(df) == 3
94
+ assert list(df.columns) == DISPLAY_COLUMNS
95
+ assert mock_get.call_count == 1
96
+
97
+ @patch("app.requests.get")
98
+ def test_data_cleaning_rounding(self, mock_get, sample_csv_data):
99
+ """Test that numeric columns are properly rounded."""
100
+ mock_responses = create_github_api_response(sample_csv_data)
101
+ mock_get.side_effect = mock_responses
102
+
103
+ df = download_leaderboard()
104
+
105
+ # Check rounding
106
+ assert df["mean_normalized_score"].dtype in [float, "float64"]
107
+ assert df["std_normalized_score"].dtype in [float, "float64"]
108
+ # Values should be rounded to 3 decimal places
109
+ assert df.loc[0, "mean_normalized_score"] == 0.912
110
+ assert df.loc[1, "mean_normalized_score"] == 0.854
111
+ assert df.loc[2, "mean_normalized_score"] == 0.789
112
+
113
+ @patch("app.requests.get")
114
+ def test_percentage_conversion(self, mock_get, sample_csv_data):
115
+ """Test that medal percentages are converted from decimal to percentage."""
116
+ mock_responses = create_github_api_response(sample_csv_data)
117
+ mock_get.side_effect = mock_responses
118
+
119
+ df = download_leaderboard()
120
+
121
+ # Check percentage conversion (0.876543 * 100 = 87.6543, rounded to 87.7)
122
+ assert df.loc[1, "mean_medal_pct"] == 87.7
123
+ assert df.loc[0, "mean_medal_pct"] == 92.3
124
+ assert df.loc[2, "mean_medal_pct"] == 76.5
125
+
126
+ @patch("app.requests.get")
127
+ def test_date_formatting(self, mock_get, sample_csv_data):
128
+ """Test that dates are properly formatted."""
129
+ mock_responses = create_github_api_response(sample_csv_data)
130
+ mock_get.side_effect = mock_responses
131
+
132
+ df = download_leaderboard()
133
+
134
+ # Check date formatting
135
+ assert df.loc[0, "Date"] == "2024-02-01"
136
+ assert df.loc[1, "Date"] == "2024-01-15"
137
+ assert df.loc[2, "Date"] == "2024-01-20"
138
+
139
+ @patch("app.requests.get")
140
+ def test_sorting(self, mock_get, sample_csv_data):
141
+ """Test that dataframe is sorted by mean_normalized_score descending."""
142
+ mock_responses = create_github_api_response(sample_csv_data)
143
+ mock_get.side_effect = mock_responses
144
+
145
+ df = download_leaderboard()
146
+
147
+ # Check sorting (highest score first)
148
+ scores = df["mean_normalized_score"].tolist()
149
+ assert scores == sorted(scores, reverse=True)
150
+ assert df.loc[0, "experiment_id"] == "exp_003" # Highest score
151
+ assert df.loc[2, "experiment_id"] == "exp_002" # Lowest score
152
+
153
+ @patch("app.requests.get")
154
+ def test_extra_columns_filtered(self, mock_get, sample_csv_with_extra_columns):
155
+ """Test that extra columns are filtered out."""
156
+ mock_responses = create_github_api_response(sample_csv_with_extra_columns)
157
+ mock_get.side_effect = mock_responses
158
+
159
+ df = download_leaderboard()
160
+
161
+ # Check that only display columns are present
162
+ assert list(df.columns) == DISPLAY_COLUMNS
163
+ assert "extra_col" not in df.columns
164
+
165
+ @patch("app.requests.get")
166
+ def test_missing_columns_error(self, mock_get, sample_csv_missing_columns):
167
+ """Test that missing required columns raise ValueError."""
168
+ mock_responses = create_github_api_response(sample_csv_missing_columns)
169
+ mock_get.side_effect = mock_responses
170
+
171
+ with pytest.raises(ValueError, match="Leaderboard is missing expected columns"):
172
+ download_leaderboard()
173
+
174
+ @patch("app.requests.get")
175
+ def test_http_error(self, mock_get):
176
+ """Test handling of HTTP errors."""
177
+ mock_response = Mock()
178
+ mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")
179
+ mock_get.return_value = mock_response
180
+
181
+ with pytest.raises(requests.HTTPError):
182
+ download_leaderboard()
183
+
184
+ @patch("app.requests.get")
185
+ def test_network_error(self, mock_get):
186
+ """Test handling of network errors."""
187
+ mock_get.side_effect = requests.ConnectionError("Connection failed")
188
+
189
+ with pytest.raises(requests.ConnectionError):
190
+ download_leaderboard()
191
+
192
+ @patch("app.requests.get")
193
+ def test_timeout_handling(self, mock_get):
194
+ """Test that timeout parameter is passed correctly."""
195
+ csv_data = "experiment_id,mean_normalized_score,std_normalized_score,mean_medal_pct,sem_medal_pct,Agent,LLM(s) used,Date\nexp_001,0.85,0.01,0.87,0.01,Agent A,GPT-4,2024-01-15"
196
+ mock_responses = create_github_api_response(csv_data)
197
+ mock_get.side_effect = mock_responses
198
+
199
+ download_leaderboard()
200
+
201
+ # Verify timeout was passed
202
+ assert mock_get.call_count >= 1
203
+ # Check that timeout is in the first call (API call)
204
+ call_kwargs = mock_get.call_args_list[0][1]
205
+ assert call_kwargs["timeout"] == 30
206
+
207
+ @patch("app.requests.get")
208
+ def test_empty_dataframe(self, mock_get):
209
+ """Test handling of empty CSV (header only)."""
210
+ csv_data = ",".join(DISPLAY_COLUMNS) # Header only
211
+ mock_responses = create_github_api_response(csv_data)
212
+ mock_get.side_effect = mock_responses
213
+
214
+ df = download_leaderboard()
215
+
216
+ assert isinstance(df, pd.DataFrame)
217
+ assert len(df) == 0
218
+ assert list(df.columns) == DISPLAY_COLUMNS
219
+
220
+ @patch("app.requests.get")
221
+ def test_invalid_date_handling(self, mock_get):
222
+ """Test that invalid dates are handled gracefully."""
223
+ csv_with_invalid_date = """experiment_id,mean_normalized_score,std_normalized_score,mean_medal_pct,sem_medal_pct,Agent,LLM(s) used,Date
224
+ exp_001,0.854321,0.012345,0.876543,0.009876,Agent A,GPT-4,invalid-date
225
+ exp_002,0.789012,0.023456,0.765432,0.012345,Agent B,Claude-3,2024-01-20"""
226
+ mock_responses = create_github_api_response(csv_with_invalid_date)
227
+ mock_get.side_effect = mock_responses
228
+
229
+ df = download_leaderboard()
230
+
231
+ # Invalid dates should become NaT and then empty string or NaN
232
+ assert pd.isna(df.loc[0, "Date"]) or df.loc[0, "Date"] == ""
233
+ assert df.loc[1, "Date"] == "2024-01-20"
234
+
235
+ @patch("app.requests.get")
236
+ def test_git_lfs_pointer_file(self, mock_get, sample_csv_data):
237
+ """Test handling of Git LFS pointer files."""
238
+ # First response: API with LFS pointer
239
+ mock_responses = create_github_api_response(sample_csv_data, is_lfs_pointer=True)
240
+ # Add download response for LFS file
241
+ download_response = Mock()
242
+ download_response.text = sample_csv_data
243
+ download_response.raise_for_status = Mock()
244
+ mock_responses.append(download_response)
245
+ mock_get.side_effect = mock_responses
246
+
247
+ df = download_leaderboard()
248
+
249
+ # Should successfully download via download_url
250
+ assert isinstance(df, pd.DataFrame)
251
+ assert len(df) == 3
252
+ assert list(df.columns) == DISPLAY_COLUMNS
253
+ # Should make 2 calls: API call + download_url call
254
+ assert mock_get.call_count == 2
255
+
256
+ @patch("app.requests.get")
257
+ def test_large_file_download_url(self, mock_get, sample_csv_data):
258
+ """Test handling of large files that only have download_url."""
259
+ mock_responses = create_github_api_response(sample_csv_data, use_download_url=True)
260
+ mock_get.side_effect = mock_responses
261
+
262
+ df = download_leaderboard()
263
+
264
+ assert isinstance(df, pd.DataFrame)
265
+ assert len(df) == 3
266
+ assert list(df.columns) == DISPLAY_COLUMNS
267
+ # Should make 2 calls: API call + download_url call
268
+ assert mock_get.call_count == 2
269
+
270
+
271
+ class TestRefreshLeaderboard:
272
+ """Tests for refresh_leaderboard function."""
273
+
274
+ @patch("app.download_leaderboard")
275
+ def test_refresh_leaderboard_success(self, mock_download):
276
+ """Test that refresh_leaderboard returns dataframe and status message."""
277
+ # Setup mocks
278
+ mock_df = pd.DataFrame(
279
+ {
280
+ "experiment_id": ["exp_001"],
281
+ "mean_normalized_score": [0.85],
282
+ "Agent": ["Agent A"],
283
+ }
284
+ )
285
+ mock_download.return_value = mock_df
286
+
287
+ # Execute
288
+ df, status = refresh_leaderboard()
289
+
290
+ # Assertions
291
+ assert df is mock_df
292
+ assert "Showing data from" in status
293
+ assert "GitHub" in status
294
+ # Check that status contains timestamp in expected format (YYYY-MM-DD HH:MM UTC)
295
+ assert "UTC" in status
296
+ assert "Last refreshed:" in status
297
+ # Verify timestamp format (should match pattern YYYY-MM-DD HH:MM)
298
+ import re
299
+
300
+ timestamp_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2} UTC"
301
+ assert re.search(timestamp_pattern, status) is not None
302
+ mock_download.assert_called_once()
303
+
304
+ @patch("app.download_leaderboard")
305
+ def test_refresh_leaderboard_includes_url(self, mock_download):
306
+ """Test that status message includes the GitHub URL."""
307
+ mock_df = pd.DataFrame()
308
+ mock_download.return_value = mock_df
309
+
310
+ df, status = refresh_leaderboard()
311
+
312
+ assert "github.com" in status.lower() or "GitHub" in status
313
+ assert "upgini/mle-bench" in status
314
+
315
+ @patch("app.download_leaderboard")
316
+ def test_refresh_leaderboard_propagates_error(self, mock_download):
317
+ """Test that errors from download_leaderboard are propagated."""
318
+ mock_download.side_effect = requests.HTTPError("404 Not Found")
319
+
320
+ with pytest.raises(requests.HTTPError):
321
+ refresh_leaderboard()