dolev31 committed on
Commit
d7ede03
·
1 Parent(s): af68c03

Add data persistence via private HF dataset repo

Browse files

Runtime data now survives Space restarts:
- _restore_data_files() on startup downloads from dolev31/st-webagentbench-data
- _persist_file() uploads after every write (submissions, key requests, audit)
- Private dataset repo, owner-only access via HF_TOKEN

Files changed (2) hide show
  1. app.py +69 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -26,6 +26,7 @@ from typing import List, Optional
26
 
27
  import gradio as gr
28
  from gradio.themes.utils import colors, fonts, sizes
 
29
  import pandas as pd
30
  import plotly.graph_objects as go
31
 
@@ -133,6 +134,7 @@ def _log_admin_action(action: str, details: str) -> None:
133
  }
134
  with open(ADMIN_AUDIT_FILE, "a") as f:
135
  f.write(json.dumps(record) + "\n")
 
136
 
137
 
138
  # Master secret env var name — used to derive per-user signing keys.
@@ -156,6 +158,66 @@ KEY_REQUESTS_FILE = Path("data/key_requests.jsonl")
156
  TASKS_FILE = Path("data/test.raw.json")
157
  CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  # Load canonical task definitions for validation
160
  _TASKS_DATA = None
161
  _CANONICAL_HASHES = None
@@ -233,6 +295,7 @@ def _log_key_request(email: str, team: str, institution: str) -> None:
233
  }
234
  with open(KEY_REQUESTS_FILE, "a") as f:
235
  f.write(json.dumps(record) + "\n")
 
236
 
237
 
238
  def _load_key_requests() -> list[dict]:
@@ -611,6 +674,7 @@ def save_submission(submission: dict) -> None:
611
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
612
  with open(SUBMISSIONS_FILE, "a") as f:
613
  f.write(json.dumps(submission) + "\n")
 
614
 
615
 
616
  # ---------------------------------------------------------------------------
@@ -1146,6 +1210,7 @@ def admin_remove_submission(agent_id: str, session_token: str):
1146
  SUBMISSIONS_FILE.write_text(
1147
  "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1148
  )
 
1149
  _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1150
  return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
1151
 
@@ -2081,6 +2146,10 @@ contact details.
2081
  return demo
2082
 
2083
 
 
 
 
 
2084
  if __name__ == "__main__":
2085
  app = create_app()
2086
  app.launch()
 
26
 
27
  import gradio as gr
28
  from gradio.themes.utils import colors, fonts, sizes
29
+ from huggingface_hub import HfApi
30
  import pandas as pd
31
  import plotly.graph_objects as go
32
 
 
134
  }
135
  with open(ADMIN_AUDIT_FILE, "a") as f:
136
  f.write(json.dumps(record) + "\n")
137
+ _persist_file(str(ADMIN_AUDIT_FILE), "admin_audit.jsonl")
138
 
139
 
140
  # Master secret env var name — used to derive per-user signing keys.
 
158
  TASKS_FILE = Path("data/test.raw.json")
159
  CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
160
 
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # Data persistence — external private dataset repo (survives Space restarts)
164
+ # ---------------------------------------------------------------------------
165
+
166
_DATA_REPO_ID = "dolev31/st-webagentbench-data"
# Module-level cache so we build at most one HfApi client per process.
_HF_API: HfApi | None = None


def _get_hf_api() -> HfApi | None:
    """Return a lazily-created, cached HfApi client.

    Returns None when no HF_TOKEN environment variable is set, in which
    case persistence is disabled and callers should no-op.
    """
    global _HF_API
    # Create the client only once, and only if a token is configured
    # (HfApi reads HF_TOKEN from the environment on its own).
    if _HF_API is None and os.environ.get("HF_TOKEN"):
        _HF_API = HfApi()
    return _HF_API
179
+
180
+
181
def _persist_file(local_path: str, repo_path: str) -> None:
    """Best-effort upload of a local data file to the private dataset repo.

    Silently no-ops when persistence is disabled (no HF_TOKEN). Upload
    failures are logged with a traceback but never propagate, so a flaky
    network can't break the request that triggered the write.
    """
    client = _get_hf_api()
    if client is None:
        return
    try:
        client.upload_file(
            repo_id=_DATA_REPO_ID,
            repo_type="dataset",
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            commit_message=f"Auto-persist {repo_path}",
        )
    except Exception:
        logger.warning("Failed to persist %s", repo_path, exc_info=True)
196
+
197
+
198
def _restore_data_files() -> None:
    """Pull the latest persisted data files from the dataset repo at startup.

    Skips entirely when no HF_TOKEN is configured, and never overwrites a
    local file that already contains data (e.g. writes made mid-session).
    A missing remote file is expected on first run and logged at info level.
    """
    client = _get_hf_api()
    if client is None:
        logger.info("No HF_TOKEN — skipping data restore from dataset repo")
        return
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)
    for name in ("submissions.jsonl", "key_requests.jsonl", "admin_audit.jsonl"):
        target = data_dir / name
        if target.exists() and target.stat().st_size > 0:
            continue  # local copy already has data — don't clobber it
        try:
            client.hf_hub_download(
                repo_id=_DATA_REPO_ID,
                repo_type="dataset",
                filename=name,
                local_dir="data",
            )
        except Exception:
            # Most likely the file simply doesn't exist in the repo yet.
            logger.info("No existing %s in data repo (first run?)", name)
        else:
            logger.info("Restored %s from data repo", name)
219
+
220
+
221
  # Load canonical task definitions for validation
222
  _TASKS_DATA = None
223
  _CANONICAL_HASHES = None
 
295
  }
296
  with open(KEY_REQUESTS_FILE, "a") as f:
297
  f.write(json.dumps(record) + "\n")
298
+ _persist_file(str(KEY_REQUESTS_FILE), "key_requests.jsonl")
299
 
300
 
301
  def _load_key_requests() -> list[dict]:
 
674
  SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
675
  with open(SUBMISSIONS_FILE, "a") as f:
676
  f.write(json.dumps(submission) + "\n")
677
+ _persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
678
 
679
 
680
  # ---------------------------------------------------------------------------
 
1210
  SUBMISSIONS_FILE.write_text(
1211
  "\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
1212
  )
1213
+ _persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
1214
  _log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
1215
  return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
1216
 
 
2146
  return demo
2147
 
2148
 
2149
+ # Restore persisted data on module load (runs on Space startup)
2150
+ _restore_data_files()
2151
+
2152
+
2153
  if __name__ == "__main__":
2154
  app = create_app()
2155
  app.launch()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio>=4.0
 
2
  pandas
3
  plotly
4
  pydantic>=2.0
 
1
  gradio>=4.0
2
+ huggingface_hub>=0.21
3
  pandas
4
  plotly
5
  pydantic>=2.0