openhands committed on
Commit
d05a70e
·
1 Parent(s): 1ca9fc3

Add REST API for programmatic access to leaderboard data

Browse files

- Add api.py with FastAPI endpoints for leaderboard data
- Mount API on root path using gr.mount_gradio_app()
- Add API documentation link to About page Resources section
- API uses same data loading functions as UI for consistency

Endpoints:
- /health - Health check with cache status
- /leaderboard - Full leaderboard with filtering/sorting
- /leaderboard/models - List all models
- /leaderboard/model/{name} - Get specific model
- /categories - List benchmark categories
- /benchmarks - List benchmarks
- /docs - Swagger UI documentation

Files changed (4) hide show
  1. about.py +1 -0
  2. api.py +356 -0
  3. app.py +10 -8
  4. requirements.txt +1 -0
about.py CHANGED
@@ -51,6 +51,7 @@ def build_page():
51
  <li><a href="https://github.com/OpenHands/software-agent-sdk" target="_blank">Software Agent SDK</a> - The agent code used for evaluation</li>
52
  <li><a href="https://github.com/OpenHands/benchmarks" target="_blank">Benchmarks</a> - The benchmarking code</li>
53
  <li><a href="https://github.com/OpenHands/openhands-index-results" target="_blank">Results</a> - Raw evaluation results</li>
 
54
  </ul>
55
  """
56
  )
 
51
  <li><a href="https://github.com/OpenHands/software-agent-sdk" target="_blank">Software Agent SDK</a> - The agent code used for evaluation</li>
52
  <li><a href="https://github.com/OpenHands/benchmarks" target="_blank">Benchmarks</a> - The benchmarking code</li>
53
  <li><a href="https://github.com/OpenHands/openhands-index-results" target="_blank">Results</a> - Raw evaluation results</li>
54
+ <li><a href="/api/docs" target="_blank">API Documentation</a> - REST API for programmatic access to leaderboard data</li>
55
  </ul>
56
  """
57
  )
api.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ REST API for OpenHands Index leaderboard data.
3
+
4
+ This module provides API endpoints that use the same data loading functions
5
+ as the Gradio UI, ensuring consistency between the web interface and API responses.
6
+ """
7
+
8
+ import logging
9
+ import math
10
+ from datetime import datetime
11
+ from typing import Optional, Any
12
+
13
+ from fastapi import FastAPI, Query, HTTPException
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+ from fastapi.responses import HTMLResponse
16
+
17
+ from simple_data_loader import SimpleLeaderboardViewer
18
+ from config import CONFIG_NAME, EXTRACTED_DATA_DIR
19
+ from setup_data import _last_fetch_time, CACHE_TTL_SECONDS
20
+ import os
21
+
22
+
23
+ def _sanitize_value(val: Any) -> Any:
24
+ """Convert NaN/inf values to None for JSON serialization."""
25
+ if val is None:
26
+ return None
27
+ if isinstance(val, float):
28
+ if math.isnan(val) or math.isinf(val):
29
+ return None
30
+ return val
31
+
32
+
33
def _sanitize_dict(d: dict) -> dict:
    """Recursively sanitize a mapping for JSON serialization.

    Walks nested dicts and lists to any depth, replacing non-finite floats
    (NaN/±inf) with None via _sanitize_value. The previous version only
    recursed one level into lists, so a list nested inside another list
    kept its NaN/inf values; the single recursive walker below covers
    dicts, lists, and scalars uniformly at any depth.

    Returns a new dict; the input is not mutated.
    """
    def _clean(v: Any) -> Any:
        # One walker for all shapes: recurse into containers, delegate
        # scalars to the shared sanitizer so the two helpers stay in sync.
        if isinstance(v, dict):
            return {k: _clean(item) for k, item in v.items()}
        if isinstance(v, list):
            return [_clean(item) for item in v]
        return _sanitize_value(v)

    return {k: _clean(v) for k, v in d.items()}
44
+
45
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)

# Create FastAPI app for API endpoints
api_app = FastAPI(
    title="OpenHands Index API",
    # The description is rendered as Markdown on the Swagger/ReDoc pages.
    description="""
REST API for accessing OpenHands Index benchmark results.

The OpenHands Index is a comprehensive benchmark for evaluating AI coding agents
across real-world software engineering tasks. It assesses models across five categories:

- **Issue Resolution**: Fixing bugs (SWE-Bench)
- **Greenfield**: Building new applications (Commit0)
- **Frontend**: UI development (SWE-Bench Multimodal)
- **Testing**: Test generation (SWT-Bench)
- **Information Gathering**: Research tasks (GAIA)

This API provides the same data that powers the leaderboard UI.
""",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# NOTE(review): wildcard origins combined with allow_credentials=True is
# broader than the CORS spec permits for a literal "*"; Starlette compensates
# by echoing the request origin, which effectively allows credentialed
# requests from ANY site — confirm this exposure is intended for a
# public read-only API.
api_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Benchmark to category mappings (same as simple_data_loader.py)
# NOTE(review): duplicated from simple_data_loader — keep the two in sync,
# or import from a single shared definition.
BENCHMARK_TO_CATEGORIES = {
    'swe-bench': ['Issue Resolution'],
    'swe-bench-multimodal': ['Frontend'],
    'commit0': ['Greenfield'],
    'swt-bench': ['Testing'],
    'gaia': ['Information Gathering'],
}

# Display/iteration order of categories used by the endpoints below.
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']

# Human-readable blurbs served by /categories.
CATEGORY_DESCRIPTIONS = {
    "Issue Resolution": "Fixing bugs in real GitHub issues (SWE-Bench)",
    "Greenfield": "Building new applications from scratch (Commit0)",
    "Frontend": "UI development with visual context (SWE-Bench Multimodal)",
    "Testing": "Test generation and quality (SWT-Bench)",
    "Information Gathering": "Research and information retrieval (GAIA)",
}

# Openness mapping (same as aliases.py)
# Collapses the fine-grained source labels into the two values the API
# filters on ("open"/"closed"); unknown labels pass through unchanged.
OPENNESS_MAPPING = {
    'open': 'open',
    'open_weights': 'open',
    'open_weights_open_data': 'open',
    'closed': 'closed',
    'closed_api_available': 'closed',
    'closed_api_unavailable': 'closed',
}
105
+
106
+
107
def _get_leaderboard_data() -> dict:
    """
    Load leaderboard data using the same SimpleLeaderboardViewer used by the UI.
    This ensures API responses match what's displayed in the Gradio interface.

    Returns a dict with:
      - "entries": list of sanitized per-model dicts, sorted by average score
        descending (empty on failure).
      - "total_count": number of entries (present only on success).
      - "fetched_at": ISO timestamp of the last data fetch, or None.
      - "error": message string (present only when loading failed / no data).
    """
    try:
        # Fall back to bundled mock data when the extracted dataset is absent
        # (e.g. local development before any download has run).
        data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
        viewer = SimpleLeaderboardViewer(
            data_dir=data_dir,
            config=CONFIG_NAME,
            split="test"
        )

        # NOTE(review): _load() is private to SimpleLeaderboardViewer;
        # depending on it couples the API to the viewer's internals —
        # consider exposing a public accessor there.
        raw_df, tag_map = viewer._load()

        # A "Message" column presumably marks a placeholder frame with no
        # real results — verify against the viewer implementation.
        if raw_df is None or raw_df.empty or "Message" in raw_df.columns:
            return {"entries": [], "error": "No data available"}

        entries = []
        for _, row in raw_df.iterrows():
            # Normalize openness
            raw_openness = row.get('openness', 'unknown')
            normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)

            # Column names below must match what the viewer produces
            # (e.g. 'Language model', 'average score') — any rename there
            # silently yields None here.
            entry = {
                "id": row.get('id'),
                "language_model": row.get('Language model'),
                "sdk_version": row.get('SDK version'),
                "openness": normalized_openness,
                "average_score": row.get('average score'),
                "average_cost": row.get('average cost'),
                "average_runtime": row.get('average runtime'),
                "categories_completed": row.get('categories_completed', 0),
                "release_date": row.get('release_date'),
                "benchmarks": {},
                "categories": {},
            }

            # Add benchmark-level data
            for benchmark in BENCHMARK_TO_CATEGORIES.keys():
                score_col = f'{benchmark} score'
                cost_col = f'{benchmark} cost'
                runtime_col = f'{benchmark} runtime'
                download_col = f'{benchmark} download'
                viz_col = f'{benchmark} visualization'

                # NOTE(review): pandas NaN is not None, so a NaN score still
                # creates an entry here (its fields become None after the
                # sanitization below) — confirm that is intended.
                if score_col in row and row[score_col] is not None:
                    entry["benchmarks"][benchmark] = {
                        "score": row.get(score_col),
                        "cost": row.get(cost_col),
                        "runtime": row.get(runtime_col),
                        "download_url": row.get(download_col),
                        "visualization_url": row.get(viz_col),
                    }

            # Add category-level data
            for category in ALL_CATEGORIES:
                score_col = f'{category} score'
                cost_col = f'{category} cost'
                runtime_col = f'{category} runtime'

                if score_col in row and row[score_col] is not None:
                    entry["categories"][category] = {
                        "score": row.get(score_col),
                        "cost": row.get(cost_col),
                        "runtime": row.get(runtime_col),
                    }

            # Sanitize the entry to handle NaN values
            entries.append(_sanitize_dict(entry))

        # Sort by average score descending; `or 0` pushes missing/None
        # scores to the bottom of the board.
        entries.sort(key=lambda x: x.get('average_score') or 0, reverse=True)

        return {
            "entries": entries,
            "total_count": len(entries),
            # NOTE(review): _last_fetch_time is imported by value from
            # setup_data at module load; if setup_data rebinds it later the
            # update is invisible here — consider `import setup_data` and
            # reading the attribute instead.
            "fetched_at": _last_fetch_time.isoformat() if _last_fetch_time else None,
        }

    except Exception as e:
        # Broad catch is deliberate: this is the data boundary for the API;
        # callers translate the error dict into an HTTP response rather than
        # letting the server 500 on any loader failure.
        logger.error(f"Error loading leaderboard data: {e}")
        return {"entries": [], "error": str(e)}
190
+
191
+
192
@api_app.get("/", tags=["Info"])
async def api_root():
    """API information and available endpoints."""
    # NOTE(review): these paths carry an "/api" prefix, but the routes in
    # this module are registered at the app root (e.g. "/leaderboard");
    # whether "/api/..." resolves depends on how api_app is mounted — verify.
    endpoint_index = {
        "/api/leaderboard": "Get the full leaderboard with scores and metadata",
        "/api/leaderboard/models": "List all language models in the leaderboard",
        "/api/leaderboard/model/{model_name}": "Get data for a specific model",
        "/api/categories": "List all benchmark categories",
        "/api/benchmarks": "List all benchmarks",
        "/api/health": "Health check endpoint",
    }
    return {
        "name": "OpenHands Index API",
        "version": "1.0.0",
        "description": "REST API for accessing OpenHands Index benchmark results",
        "documentation": "/api/docs",
        "endpoints": endpoint_index,
    }
209
+
210
+
211
@api_app.get("/health", tags=["Health"])
async def health_check():
    """Check API health status and cache information."""
    # Age of the cached dataset in seconds, or None if nothing has been
    # fetched yet.
    cache_age = None
    if _last_fetch_time is not None:
        # NOTE(review): datetime.now() is naive; this assumes
        # _last_fetch_time is also naive local time — confirm in
        # setup_data, otherwise the subtraction raises or skews.
        cache_age = (datetime.now() - _last_fetch_time).total_seconds()

    # NOTE(review): _last_fetch_time was imported by value at module load;
    # later rebinding inside setup_data is invisible here, so cache_age may
    # permanently reflect the startup state — verify.
    return {
        "status": "healthy",
        "version": "1.0.0",
        "cache_ttl_seconds": CACHE_TTL_SECONDS,
        "cache_age_seconds": cache_age,
        "last_fetch_time": _last_fetch_time.isoformat() if _last_fetch_time else None,
    }
225
+
226
+
227
@api_app.get("/leaderboard", tags=["Leaderboard"])
async def get_leaderboard(
    openness: Optional[str] = Query(None, description="Filter by openness (open/closed)"),
    min_categories: Optional[int] = Query(None, description="Minimum categories completed"),
    sort_by: str = Query("average_score", description="Sort field (average_score, average_cost, average_runtime)"),
    limit: Optional[int] = Query(None, description="Limit number of results"),
):
    """
    Get the full leaderboard with benchmark scores and metadata.

    Returns the same data displayed in the OpenHands Index UI leaderboard.

    Raises:
        HTTPException 400: if sort_by is not one of the supported numeric fields.
        HTTPException 503: if no leaderboard data could be loaded.
    """
    # Only numeric fields are sortable. Rejecting anything else up front
    # avoids a TypeError (and an opaque 500) when the sort key would compare
    # strings against the float infinity sentinel used for missing values.
    sortable_fields = {"average_score", "average_cost", "average_runtime"}
    if sort_by not in sortable_fields:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid sort_by '{sort_by}'; must be one of {sorted(sortable_fields)}",
        )

    data = _get_leaderboard_data()

    if "error" in data and data.get("entries") == []:
        raise HTTPException(status_code=503, detail=data["error"])

    entries = data.get("entries", [])

    # Apply filters
    if openness:
        entries = [e for e in entries if e.get("openness") == openness]

    if min_categories is not None:
        entries = [e for e in entries if (e.get("categories_completed") or 0) >= min_categories]

    # Apply sorting: scores sort descending (higher is better), cost/runtime
    # ascending (lower is better). Entries missing the field sort last in
    # either direction via an infinity sentinel.
    reverse = sort_by not in ("average_cost", "average_runtime")
    missing_sentinel = float("-inf") if reverse else float("inf")
    entries.sort(
        key=lambda x: x.get(sort_by) if x.get(sort_by) is not None else missing_sentinel,
        reverse=reverse,
    )

    # Apply limit. Checking `is not None` (not truthiness) so limit=0 yields
    # an empty page instead of being silently ignored; negative limits are
    # ignored rather than accidentally slicing from the end of the list.
    if limit is not None and limit >= 0:
        entries = entries[:limit]

    return {
        "entries": entries,
        "total_count": len(entries),
        "categories": ALL_CATEGORIES,
        "benchmarks": list(BENCHMARK_TO_CATEGORIES.keys()),
        "fetched_at": data.get("fetched_at"),
    }
274
+
275
+
276
@api_app.get("/leaderboard/models", tags=["Leaderboard"])
async def list_models(
    openness: Optional[str] = Query(None, description="Filter by openness (open/closed)"),
):
    """List all language models available in the leaderboard."""
    leaderboard = _get_leaderboard_data()

    # Summary projection of each leaderboard entry — the per-benchmark
    # detail is intentionally omitted here.
    summary_fields = (
        "language_model",
        "sdk_version",
        "openness",
        "average_score",
        "categories_completed",
    )

    models = []
    for entry in leaderboard.get("entries", []):
        if openness and entry.get("openness") != openness:
            continue
        models.append({field: entry.get(field) for field in summary_fields})

    return {
        "models": models,
        "total_count": len(models),
    }
302
+
303
+
304
@api_app.get("/leaderboard/model/{model_name}", tags=["Leaderboard"])
async def get_model(model_name: str):
    """Get detailed data for a specific language model."""
    # Case-insensitive match; an entry with no model name never matches.
    wanted = model_name.lower()

    matching = []
    for entry in _get_leaderboard_data().get("entries", []):
        if (entry.get("language_model") or "").lower() == wanted:
            matching.append(entry)

    if not matching:
        raise HTTPException(status_code=404, detail=f"Model '{model_name}' not found")

    return {
        "model_name": model_name,
        "entries": matching,
        "count": len(matching),
    }
321
+
322
+
323
@api_app.get("/categories", tags=["Metadata"])
async def list_categories():
    """List all benchmark categories with their associated benchmarks."""
    # Invert the benchmark -> categories mapping into category -> benchmarks.
    benchmarks_by_category = {}
    for benchmark, cats in BENCHMARK_TO_CATEGORIES.items():
        for cat in cats:
            benchmarks_by_category.setdefault(cat, []).append(benchmark)

    # Emit in the canonical ALL_CATEGORIES order.
    payload = []
    for category in ALL_CATEGORIES:
        payload.append(
            {
                "name": category,
                "description": CATEGORY_DESCRIPTIONS.get(category, ""),
                "benchmarks": benchmarks_by_category.get(category, []),
            }
        )

    return {"categories": payload}
343
+
344
+
345
@api_app.get("/benchmarks", tags=["Metadata"])
async def list_benchmarks():
    """List all benchmarks with their category mappings."""
    benchmark_list = []
    for benchmark, categories in BENCHMARK_TO_CATEGORIES.items():
        benchmark_list.append({"name": benchmark, "categories": categories})
    return {"benchmarks": benchmark_list}
app.py CHANGED
@@ -378,17 +378,19 @@ with demo.route("About", "/about"):
378
 
379
  logger.info("All routes configured")
380
 
 
 
 
 
381
 
382
- # Launch the Gradio app
 
383
  if __name__ == "__main__":
 
384
  # Respect platform port/host if provided (e.g., OpenHands runtime)
385
  port = int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", 7860)))
386
  host = os.environ.get("HOST", os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"))
387
- if LOCAL_DEBUG:
388
- logger.info("Launching in LOCAL_DEBUG mode")
389
- demo.launch(server_name=host, server_port=port, debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
390
- else:
391
- logger.info("Launching in Space mode")
392
- demo.launch(server_name=host, server_port=port, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
393
- logger.info("Gradio app launched successfully")
394
 
 
378
 
379
  logger.info("All routes configured")
380
 
381
+ # Mount the REST API on /api
382
+ from api import api_app
383
+ app = gr.mount_gradio_app(api_app, demo, path="/")
384
+ logger.info("REST API mounted, Gradio app mounted at /")
385
 
386
+
387
+ # Launch the app
388
  if __name__ == "__main__":
389
+ import uvicorn
390
  # Respect platform port/host if provided (e.g., OpenHands runtime)
391
  port = int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", 7860)))
392
  host = os.environ.get("HOST", os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"))
393
+ logger.info(f"Launching app on {host}:{port}")
394
+ uvicorn.run(app, host=host, port=port)
395
+ logger.info("App launched successfully")
 
 
 
 
396
 
requirements.txt CHANGED
@@ -4,6 +4,7 @@ pandas==2.2.3
4
  plotly==6.0.1
5
  requests==2.32.3
6
  huggingface-hub==0.30.2
 
7
 
8
  # Additional dependencies for UI and processing
9
  matplotlib==3.10.3
 
4
  plotly==6.0.1
5
  requests==2.32.3
6
  huggingface-hub==0.30.2
7
+ fastapi>=0.104.0
8
 
9
  # Additional dependencies for UI and processing
10
  matplotlib==3.10.3