openhands committed on
Commit
6bddf26
·
1 Parent(s): d68f190

Add periodic cache refresh for leaderboard data

Browse files

- Add background refresh scheduler that checks for new data every hour
- Implement thread-safe caching with configurable TTL (CACHE_TTL_SECONDS env var)
- Register cache clear callback to invalidate viewer cache when data refreshes
- Add refresh_data_if_needed() function for on-demand refresh
- Data is fetched from GitHub repo and cached, then periodically refreshed

This ensures the leaderboard shows fresh data while maintaining good performance
through caching.

Files changed (3) hide show
  1. app.py +5 -1
  2. setup_data.py +166 -6
  3. ui_components.py +34 -10
app.py CHANGED
@@ -11,9 +11,13 @@ logger.info("Starting OpenHands Index application")
11
 
12
  # Setup mock data before anything else
13
  try:
14
- from setup_data import setup_mock_data
15
  setup_mock_data()
16
  logger.info("Data setup completed successfully")
 
 
 
 
17
  except Exception as e:
18
  logger.error(f"Error during data setup: {e}", exc_info=True)
19
  logger.warning("Continuing with app startup despite error")
 
11
 
12
# Setup mock data before anything else
try:
    from setup_data import setup_mock_data, start_background_refresh, CACHE_TTL_SECONDS
    setup_mock_data()
    logger.info("Data setup completed successfully")

    # Start background refresh scheduler (checks for new data every hour)
    start_background_refresh()
    logger.info(f"Background refresh scheduler started (interval: {CACHE_TTL_SECONDS}s)")
except Exception as e:
    # Data problems are deliberately non-fatal: the viewer layer falls back to
    # a DummyViewer on load errors, so log and continue rather than abort startup.
    logger.error(f"Error during data setup: {e}", exc_info=True)
    logger.warning("Continuing with app startup despite error")
setup_data.py CHANGED
@@ -1,18 +1,33 @@
1
  """
2
  Setup script to fetch data from GitHub repository or use mock data as fallback.
3
  This runs before the app starts to ensure data is available.
 
4
  """
5
  import os
6
  import shutil
7
  import subprocess
8
  import sys
 
 
 
9
  from pathlib import Path
 
10
  from config import DATA_DIR, EXTRACTED_DATA_DIR, CONFIG_NAME
11
 
 
 
12
  GITHUB_REPO = "https://github.com/OpenHands/openhands-index-results.git"
13
  # Keep the full repo clone so we can use pydantic models from scripts/
14
  REPO_CLONE_DIR = Path("/tmp/openhands-index-results")
15
 
 
 
 
 
 
 
 
 
16
 
17
  def get_repo_clone_dir() -> Path:
18
  """Get the path to the cloned openhands-index-results repository."""
@@ -135,28 +150,118 @@ def copy_mock_data():
135
  print(f"Target directory: {target_dir.absolute()}")
136
  return True
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def setup_mock_data():
139
  """
140
  Setup data for the leaderboard.
141
  First tries to fetch from GitHub, falls back to mock data if unavailable.
142
  """
 
 
143
  print("=" * 60)
144
  print("STARTING DATA SETUP")
145
  print("=" * 60)
146
 
147
  target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
148
 
149
- # Check if data already exists
150
- if target_dir.exists() and any(target_dir.glob("*.jsonl")):
151
- jsonl_files = list(target_dir.glob("*.jsonl"))
152
- print(f"Data already exists at {target_dir}")
153
- print(f"Found {len(jsonl_files)} JSONL files: {[f.name for f in jsonl_files]}")
154
- return
 
 
 
 
 
155
 
156
  # Try to fetch from GitHub first
157
  print("\n--- Attempting to fetch from GitHub ---")
158
  try:
159
  if fetch_data_from_github():
 
160
  print("✓ Successfully using data from GitHub repository")
161
  return
162
  except Exception as e:
@@ -166,6 +271,7 @@ def setup_mock_data():
166
  print("\n--- GitHub data not available, falling back to mock data ---")
167
  try:
168
  if copy_mock_data():
 
169
  print("✓ Successfully using mock data")
170
  return
171
  except Exception as e:
@@ -175,5 +281,59 @@ def setup_mock_data():
175
  print("ERROR: No data available! Neither GitHub nor mock data could be loaded.")
176
  print("!" * 60)
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if __name__ == "__main__":
179
  setup_mock_data()
 
1
  """
2
  Setup script to fetch data from GitHub repository or use mock data as fallback.
3
  This runs before the app starts to ensure data is available.
4
+ Supports periodic refresh with caching.
5
  """
6
  import os
7
  import shutil
8
  import subprocess
9
  import sys
10
+ import threading
11
+ import time
12
+ import logging
13
  from pathlib import Path
14
+ from datetime import datetime, timedelta
15
  from config import DATA_DIR, EXTRACTED_DATA_DIR, CONFIG_NAME
16
 
17
+ logger = logging.getLogger(__name__)
18
+
19
GITHUB_REPO = "https://github.com/OpenHands/openhands-index-results.git"
# Keep the full repo clone so we can use pydantic models from scripts/
REPO_CLONE_DIR = Path("/tmp/openhands-index-results")

# Cache management
_last_fetch_time = None  # datetime of the last successful fetch; None until first fetch
_fetch_lock = threading.Lock()  # acquired non-blocking so only one refresh runs at a time
_refresh_callbacks = []  # Callbacks to call after data refresh

# Cache TTL can be configured via environment variable (default: 1 hour = 3600 seconds)
CACHE_TTL_SECONDS = int(os.environ.get("CACHE_TTL_SECONDS", 3600))
30
+
31
 
32
  def get_repo_clone_dir() -> Path:
33
  """Get the path to the cloned openhands-index-results repository."""
 
150
  print(f"Target directory: {target_dir.absolute()}")
151
  return True
152
 
153
def register_refresh_callback(callback):
    """
    Register *callback* to run after each successful data refresh.

    The callback should clear any caches derived from the data. Duplicate
    registrations are ignored so a callback fires at most once per refresh.
    """
    # Appending mutates the module-level list in place, so no ``global``
    # declaration is needed.
    if callback in _refresh_callbacks:
        return
    _refresh_callbacks.append(callback)
161
+
162
+
163
def _notify_refresh_callbacks():
    """Invoke each registered post-refresh callback, logging failures instead of raising."""
    for cb in _refresh_callbacks:
        try:
            cb()
        except Exception as e:
            # One misbehaving callback must not prevent the others from running.
            logger.warning(f"Error in refresh callback: {e}")
171
+
172
+
173
def get_last_fetch_time():
    """Return the datetime of the most recent successful data fetch, or None if none yet."""
    return _last_fetch_time
177
+
178
+
179
def is_cache_stale():
    """Return True when no fetch has happened yet, or the last one exceeded CACHE_TTL_SECONDS."""
    last = _last_fetch_time
    if last is None:
        # Never fetched: always considered stale.
        return True
    age = datetime.now() - last
    return age > timedelta(seconds=CACHE_TTL_SECONDS)
185
+
186
+
187
def refresh_data_if_needed(force: bool = False) -> bool:
    """
    Refresh data from GitHub if the cache is stale or if forced.

    Concurrency: the refresh is guarded by a non-blocking lock, so at most one
    refresh runs at a time; concurrent callers return False instead of queueing.

    Args:
        force: If True, refresh regardless of cache age

    Returns:
        True if data was refreshed, False otherwise
    """
    global _last_fetch_time, _fetch_lock

    if not force and not is_cache_stale():
        # Cache still fresh — nothing to do.
        return False

    # Use lock to prevent concurrent refreshes
    if not _fetch_lock.acquire(blocking=False):
        logger.info("Another refresh is in progress, skipping...")
        return False

    try:
        logger.info("Refreshing data from GitHub...")

        # Remove old data to force re-fetch
        # NOTE(review): old data is deleted *before* the new fetch; if both the
        # GitHub fetch and the mock fallback fail below, the app is left with no
        # data until the next refresh cycle — confirm this window is acceptable.
        target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
        if target_dir.exists():
            shutil.rmtree(target_dir)

        # Fetch new data
        if fetch_data_from_github():
            _last_fetch_time = datetime.now()
            logger.info(f"✓ Data refreshed successfully at {_last_fetch_time}")
            # Invalidate caches that depend on the data (e.g. viewer cache).
            _notify_refresh_callbacks()
            return True
        else:
            # If GitHub fails, try mock data as fallback
            logger.warning("GitHub fetch failed, trying mock data...")
            if copy_mock_data():
                _last_fetch_time = datetime.now()
                logger.info(f"✓ Using mock data (refreshed at {_last_fetch_time})")
                _notify_refresh_callbacks()
                return True
            logger.error("Failed to refresh data from any source")
            return False
    finally:
        # Always release, even if the fetch/copy raised.
        _fetch_lock.release()
233
+
234
+
235
  def setup_mock_data():
236
  """
237
  Setup data for the leaderboard.
238
  First tries to fetch from GitHub, falls back to mock data if unavailable.
239
  """
240
+ global _last_fetch_time
241
+
242
  print("=" * 60)
243
  print("STARTING DATA SETUP")
244
  print("=" * 60)
245
 
246
  target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
247
 
248
+ # Check if data already exists and cache is not stale
249
+ if target_dir.exists():
250
+ results_dir = target_dir / "results"
251
+ has_results = results_dir.exists() and any(results_dir.iterdir())
252
+ has_jsonl = any(target_dir.glob("*.jsonl"))
253
+
254
+ if has_results or has_jsonl:
255
+ if not is_cache_stale():
256
+ print(f"Data already exists at {target_dir} and cache is fresh")
257
+ return
258
+ print(f"Data exists but cache is stale, will refresh...")
259
 
260
  # Try to fetch from GitHub first
261
  print("\n--- Attempting to fetch from GitHub ---")
262
  try:
263
  if fetch_data_from_github():
264
+ _last_fetch_time = datetime.now()
265
  print("✓ Successfully using data from GitHub repository")
266
  return
267
  except Exception as e:
 
271
  print("\n--- GitHub data not available, falling back to mock data ---")
272
  try:
273
  if copy_mock_data():
274
+ _last_fetch_time = datetime.now()
275
  print("✓ Successfully using mock data")
276
  return
277
  except Exception as e:
 
281
  print("ERROR: No data available! Neither GitHub nor mock data could be loaded.")
282
  print("!" * 60)
283
 
284
+
285
# Background refresh scheduler
_scheduler_thread = None
_scheduler_stop_event = threading.Event()


def _background_refresh_loop():
    """Background thread body: force a data refresh once per TTL interval until stopped."""
    logger.info(f"Background refresh scheduler started (interval: {CACHE_TTL_SECONDS}s)")

    while True:
        # Sleep one TTL interval; wait() returns True as soon as stop is requested.
        stopped = _scheduler_stop_event.wait(timeout=CACHE_TTL_SECONDS)
        if stopped:
            break

        try:
            logger.info("Background refresh triggered")
            refresh_data_if_needed(force=True)
        except Exception as e:
            # Keep the scheduler alive even if a single refresh blows up.
            logger.error(f"Error in background refresh: {e}")

    logger.info("Background refresh scheduler stopped")


def start_background_refresh():
    """Start the background refresh scheduler (idempotent while the thread lives)."""
    global _scheduler_thread

    already_running = _scheduler_thread is not None and _scheduler_thread.is_alive()
    if already_running:
        logger.info("Background refresh scheduler already running")
        return

    _scheduler_stop_event.clear()
    # Daemon thread: does not prevent interpreter shutdown.
    _scheduler_thread = threading.Thread(target=_background_refresh_loop, daemon=True)
    _scheduler_thread.start()
    logger.info("Background refresh scheduler started")


def stop_background_refresh():
    """Signal the scheduler to stop and wait briefly for the thread to exit."""
    global _scheduler_thread

    if _scheduler_thread is None:
        return

    _scheduler_stop_event.set()
    _scheduler_thread.join(timeout=5)  # bounded wait; thread is daemonized anyway
    _scheduler_thread = None
    logger.info("Background refresh scheduler stopped")
336
+
337
+
338
if __name__ == "__main__":
    # Allow running this module directly for a one-off data setup.
    setup_mock_data()
ui_components.py CHANGED
@@ -361,9 +361,30 @@ plot_legend_html = f"""
361
  </div>
362
  """;
363
 
364
- # --- Global State for Viewers (simple caching) ---
365
  CACHED_VIEWERS = {}
366
  CACHED_TAG_MAPS = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
 
369
  class DummyViewer:
@@ -378,14 +399,15 @@ class DummyViewer:
378
 
379
  def get_leaderboard_viewer_instance(split: str):
380
  """
381
- Fetches the LeaderboardViewer for a split, using a cache to avoid
382
  re-downloading data. On error, returns a stable DummyViewer object.
383
  """
384
  global CACHED_VIEWERS, CACHED_TAG_MAPS
385
 
386
- if split in CACHED_VIEWERS:
387
- # Cache hit: return the cached viewer and tag map
388
- return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
 
389
 
390
  # --- Cache miss: try to load data from the source ---
391
  try:
@@ -402,9 +424,10 @@ def get_leaderboard_viewer_instance(split: str):
402
  # Simplify tag map creation
403
  pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
404
 
405
- # Cache the results for next time
406
- CACHED_VIEWERS[split] = viewer
407
- CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
 
408
 
409
  return viewer, pretty_tag_map
410
 
@@ -418,8 +441,9 @@ def get_leaderboard_viewer_instance(split: str):
418
  dummy_tag_map = {"Overall": []}
419
 
420
  # Cache the dummy objects so we don't try to fetch again on this run
421
- CACHED_VIEWERS[split] = dummy_viewer
422
- CACHED_TAG_MAPS[split] = dummy_tag_map
 
423
 
424
  return dummy_viewer, dummy_tag_map
425
 
 
361
  </div>
362
  """;
363
 
364
# --- Global State for Viewers (simple caching with TTL) ---
# Use a plain import instead of the __import__('threading') builtin hack;
# it is the idiomatic spelling and resolves to the same module object.
import threading

CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}
# Guards both cache dicts: the background refresh thread clears them while
# request handlers read and repopulate them.
_cache_lock = threading.Lock()


def clear_viewer_cache():
    """
    Clear all cached viewers and tag maps.

    Registered as a refresh callback so it runs whenever the background
    scheduler fetches fresh data, forcing viewers to be rebuilt on next use.
    """
    # .clear() mutates the dicts in place, so no ``global`` declaration needed.
    with _cache_lock:
        CACHED_VIEWERS.clear()
        CACHED_TAG_MAPS.clear()
    print("[CACHE] Viewer cache cleared after data refresh")


# Register the cache clear callback with the data refresh system
try:
    from setup_data import register_refresh_callback
    register_refresh_callback(clear_viewer_cache)
except ImportError:
    pass  # setup_data may not be available during import
388
 
389
 
390
  class DummyViewer:
 
399
 
400
  def get_leaderboard_viewer_instance(split: str):
401
  """
402
+ Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
403
  re-downloading data. On error, returns a stable DummyViewer object.
404
  """
405
  global CACHED_VIEWERS, CACHED_TAG_MAPS
406
 
407
+ with _cache_lock:
408
+ if split in CACHED_VIEWERS:
409
+ # Cache hit: return the cached viewer and tag map
410
+ return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
411
 
412
  # --- Cache miss: try to load data from the source ---
413
  try:
 
424
  # Simplify tag map creation
425
  pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
426
 
427
+ # Cache the results for next time (thread-safe)
428
+ with _cache_lock:
429
+ CACHED_VIEWERS[split] = viewer
430
+ CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
431
 
432
  return viewer, pretty_tag_map
433
 
 
441
  dummy_tag_map = {"Overall": []}
442
 
443
  # Cache the dummy objects so we don't try to fetch again on this run
444
+ with _cache_lock:
445
+ CACHED_VIEWERS[split] = dummy_viewer
446
+ CACHED_TAG_MAPS[split] = dummy_tag_map
447
 
448
  return dummy_viewer, dummy_tag_map
449