dolev31 committed on
Commit
a811e8c
·
1 Parent(s): 8a16b2b

Replace ad-hoc persistence with CommitScheduler

Browse files

The previous _persist_file() silently swallowed errors, so data was
never actually persisted to the HF dataset repo. Replace with
huggingface_hub CommitScheduler which auto-syncs data/ every 2 min.

- Remove _persist_file, _restore_data_files, _get_hf_api
- Add _init_persistence() with CommitScheduler setup
- Download existing data on startup, then schedule auto-commits
- Add persistence status banner in admin panel
- Add startup health log with record counts
- Update SDK version in README from 5.12.0 to 6.6.0

Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +67 -58
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🛡️
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.12.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
 
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 6.6.0
8
  app_file: app.py
9
  pinned: true
10
  license: mit
app.py CHANGED
@@ -27,7 +27,7 @@ from typing import List, Optional
27
 
28
  import gradio as gr
29
  from gradio.themes.utils import colors, fonts, sizes
30
- from huggingface_hub import HfApi
31
  import pandas as pd
32
  import plotly.graph_objects as go
33
 
@@ -151,7 +151,6 @@ def _log_admin_action(action: str, details: str) -> None:
151
  }
152
  with open(ADMIN_AUDIT_FILE, "a") as f:
153
  f.write(json.dumps(record) + "\n")
154
- _persist_file(str(ADMIN_AUDIT_FILE), "admin_audit.jsonl")
155
 
156
 
157
  # Master secret env var name — used to derive per-user signing keys.
@@ -177,68 +176,60 @@ CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
177
 
178
 
179
  # ---------------------------------------------------------------------------
180
- # Data persistence — external private dataset repo (survives Space restarts)
181
  # ---------------------------------------------------------------------------
182
 
183
  _DATA_REPO_ID = "dolev31/st-webagentbench-data"
184
- _HF_API: HfApi | None = None
 
 
185
 
186
 
187
- def _get_hf_api() -> HfApi | None:
188
- """Lazy-init HfApi; returns None if no usable token is found."""
189
- global _HF_API
190
- if _HF_API is not None:
191
- return _HF_API
192
- # HfApi auto-detects tokens from HF_TOKEN, HUGGING_FACE_HUB_TOKEN,
193
- # or the cached login token. Create it and check if a token is available.
194
- api = HfApi()
195
- token = api.token
196
- if token:
197
- _HF_API = api
198
- logger.info("HfApi initialized (token available)")
199
- return _HF_API
200
- logger.warning("No HF token found — data persistence disabled")
201
- return None
202
 
 
 
 
 
203
 
204
- def _persist_file(local_path: str, repo_path: str) -> None:
205
- """Push a local file to the private dataset repo (no Space rebuild)."""
206
- api = _get_hf_api()
207
- if api is None:
208
- return
209
  try:
210
- api.upload_file(
211
- path_or_fileobj=local_path,
212
- path_in_repo=repo_path,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  repo_id=_DATA_REPO_ID,
 
 
214
  repo_type="dataset",
215
- commit_message=f"Auto-persist {repo_path}",
 
 
 
216
  )
 
 
 
 
 
217
  except Exception:
218
- logger.warning("Failed to persist %s", repo_path, exc_info=True)
219
-
220
-
221
- def _restore_data_files() -> None:
222
- """On startup, download latest data files from the dataset repo."""
223
- api = _get_hf_api()
224
- if api is None:
225
- logger.info("No HF_TOKEN — skipping data restore from dataset repo")
226
- return
227
- Path("data").mkdir(parents=True, exist_ok=True)
228
- for filename in ["submissions.jsonl", "key_requests.jsonl", "admin_audit.jsonl"]:
229
- local = Path("data") / filename
230
- if local.exists() and local.stat().st_size > 0:
231
- continue # Already has data (e.g., mid-session)
232
- try:
233
- api.hf_hub_download(
234
- repo_id=_DATA_REPO_ID,
235
- repo_type="dataset",
236
- filename=filename,
237
- local_dir="data",
238
- )
239
- logger.info("Restored %s from data repo", filename)
240
- except Exception:
241
- logger.info("No existing %s in data repo (first run?)", filename)
242
 
243
 
244
  # Load canonical task definitions for validation
@@ -318,7 +309,6 @@ def _log_key_request(email: str, team: str, institution: str) -> None:
318
  }
319
  with open(KEY_REQUESTS_FILE, "a") as f:
320
  f.write(json.dumps(record) + "\n")
321
- _persist_file(str(KEY_REQUESTS_FILE), "key_requests.jsonl")
322
 
323
 
324
  def _load_key_requests() -> list[dict]:
@@ -716,7 +706,6 @@ def save_submission(submission: dict) -> None:
716
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
717
  with open(SUBMISSIONS_FILE, "a") as f:
718
  f.write(json.dumps(submission) + "\n")
719
- _persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
720
 
721
 
722
  # ---------------------------------------------------------------------------
@@ -1292,7 +1281,6 @@ def admin_remove_submission(agent_id: str, session_token: str):
1292
  SUBMISSIONS_FILE.write_text(
1293
  "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1294
  )
1295
- _persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
1296
  _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1297
  return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
1298
 
@@ -2159,7 +2147,14 @@ contact details.
2159
 
2160
  # Admin controls — hidden until login succeeds
2161
  with gr.Column(visible=False) as admin_panel:
2162
- gr.Markdown("---\n*Session active. All actions below are authenticated.*")
 
 
 
 
 
 
 
2163
 
2164
  with gr.Accordion("Remove Submission", open=True):
2165
  admin_agent_id = gr.Textbox(label="Agent ID to remove")
@@ -2231,8 +2226,22 @@ contact details.
2231
  return demo
2232
 
2233
 
2234
- # Restore persisted data on module load (runs on Space startup)
2235
- _restore_data_files()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2236
 
2237
 
2238
  if __name__ == "__main__":
 
27
 
28
  import gradio as gr
29
  from gradio.themes.utils import colors, fonts, sizes
30
+ from huggingface_hub import CommitScheduler, HfApi
31
  import pandas as pd
32
  import plotly.graph_objects as go
33
 
 
151
  }
152
  with open(ADMIN_AUDIT_FILE, "a") as f:
153
  f.write(json.dumps(record) + "\n")
 
154
 
155
 
156
  # Master secret env var name — used to derive per-user signing keys.
 
176
 
177
 
178
  # ---------------------------------------------------------------------------
179
+ # Data persistence — CommitScheduler auto-syncs data/ to HF dataset repo
180
  # ---------------------------------------------------------------------------
181
 
182
  _DATA_REPO_ID = "dolev31/st-webagentbench-data"
183
+ _DATA_DIR = Path("data")
184
+ _scheduler: CommitScheduler | None = None
185
+ _PERSISTENCE_ENABLED = False
186
 
187
 
188
+ def _init_persistence() -> bool:
189
+ """Initialize CommitScheduler for data persistence. Returns True if enabled."""
190
+ global _scheduler
191
+ _DATA_DIR.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ api = HfApi()
194
+ if not api.token:
195
+ logger.warning("No HF token found — data persistence disabled")
196
+ return False
197
 
 
 
 
 
 
198
  try:
199
+ # Download existing data files from the repo before starting the scheduler
200
+ for filename in ["submissions.jsonl", "key_requests.jsonl", "admin_audit.jsonl"]:
201
+ local = _DATA_DIR / filename
202
+ if not local.exists() or local.stat().st_size == 0:
203
+ try:
204
+ api.hf_hub_download(
205
+ repo_id=_DATA_REPO_ID,
206
+ repo_type="dataset",
207
+ filename=filename,
208
+ local_dir=str(_DATA_DIR),
209
+ )
210
+ logger.info("Restored %s from data repo", filename)
211
+ except Exception:
212
+ logger.info("No existing %s in data repo (first run?)", filename)
213
+
214
+ # Start the scheduler — auto-commits data/ every 2 minutes
215
+ _scheduler = CommitScheduler(
216
  repo_id=_DATA_REPO_ID,
217
+ folder_path=_DATA_DIR,
218
+ every=2,
219
  repo_type="dataset",
220
+ private=True,
221
+ allow_patterns=["*.jsonl"],
222
+ squash_history=True,
223
+ hf_api=api,
224
  )
225
+ logger.info(
226
+ "CommitScheduler started — persisting to %s every 2 min",
227
+ _DATA_REPO_ID,
228
+ )
229
+ return True
230
  except Exception:
231
+ logger.error("Failed to initialize persistence", exc_info=True)
232
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
 
235
  # Load canonical task definitions for validation
 
309
  }
310
  with open(KEY_REQUESTS_FILE, "a") as f:
311
  f.write(json.dumps(record) + "\n")
 
312
 
313
 
314
  def _load_key_requests() -> list[dict]:
 
706
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
707
  with open(SUBMISSIONS_FILE, "a") as f:
708
  f.write(json.dumps(submission) + "\n")
 
709
 
710
 
711
  # ---------------------------------------------------------------------------
 
1281
  SUBMISSIONS_FILE.write_text(
1282
  "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1283
  )
 
1284
  _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1285
  return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
1286
 
 
2147
 
2148
  # Admin controls — hidden until login succeeds
2149
  with gr.Column(visible=False) as admin_panel:
2150
+ _persist_msg = (
2151
+ "Data persistence: **ACTIVE** — syncing to HF dataset every 2 min"
2152
+ if _PERSISTENCE_ENABLED
2153
+ else "Data persistence: **DISABLED** — no HF_TOKEN set, "
2154
+ "data will be lost on rebuild!"
2155
+ )
2156
+ gr.Markdown(f"---\n{_persist_msg}\n\n"
2157
+ f"*Session active. All actions below are authenticated.*")
2158
 
2159
  with gr.Accordion("Remove Submission", open=True):
2160
  admin_agent_id = gr.Textbox(label="Agent ID to remove")
 
2226
  return demo
2227
 
2228
 
2229
+ # Initialize data persistence on module load (runs on Space startup)
2230
+ _PERSISTENCE_ENABLED = _init_persistence()
2231
+
2232
+ if _PERSISTENCE_ENABLED:
2233
+ logger.info("Persistence OK — data will survive Space rebuilds")
2234
+ for _f in ["key_requests.jsonl", "submissions.jsonl", "admin_audit.jsonl"]:
2235
+ _p = _DATA_DIR / _f
2236
+ if _p.exists() and _p.stat().st_size > 0:
2237
+ _count = sum(1 for line in _p.read_text().strip().split("\n") if line.strip())
2238
+ logger.info(" %s: %d records", _f, _count)
2239
+ else:
2240
+ logger.error(
2241
+ "PERSISTENCE DISABLED — set HF_TOKEN as a Space secret with write "
2242
+ "access to %s",
2243
+ _DATA_REPO_ID,
2244
+ )
2245
 
2246
 
2247
  if __name__ == "__main__":