Add data persistence via private HF dataset repo
Runtime data now survives Space restarts:
- _restore_data_files() on startup downloads from dolev31/st-webagentbench-data
- _persist_file() uploads after every write (submissions, key requests, audit)
- Private dataset repo, owner-only access via HF_TOKEN
- app.py +69 -0
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -26,6 +26,7 @@ from typing import List, Optional
|
|
| 26 |
|
| 27 |
import gradio as gr
|
| 28 |
from gradio.themes.utils import colors, fonts, sizes
|
|
|
|
| 29 |
import pandas as pd
|
| 30 |
import plotly.graph_objects as go
|
| 31 |
|
|
@@ -133,6 +134,7 @@ def _log_admin_action(action: str, details: str) -> None:
|
|
| 133 |
}
|
| 134 |
with open(ADMIN_AUDIT_FILE, "a") as f:
|
| 135 |
f.write(json.dumps(record) + "\n")
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
# Master secret env var name — used to derive per-user signing keys.
|
|
@@ -156,6 +158,66 @@ KEY_REQUESTS_FILE = Path("data/key_requests.jsonl")
|
|
| 156 |
TASKS_FILE = Path("data/test.raw.json")
|
| 157 |
CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
# Load canonical task definitions for validation
|
| 160 |
_TASKS_DATA = None
|
| 161 |
_CANONICAL_HASHES = None
|
|
@@ -233,6 +295,7 @@ def _log_key_request(email: str, team: str, institution: str) -> None:
|
|
| 233 |
}
|
| 234 |
with open(KEY_REQUESTS_FILE, "a") as f:
|
| 235 |
f.write(json.dumps(record) + "\n")
|
|
|
|
| 236 |
|
| 237 |
|
| 238 |
def _load_key_requests() -> list[dict]:
|
|
@@ -611,6 +674,7 @@ def save_submission(submission: dict) -> None:
|
|
| 611 |
SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
| 612 |
with open(SUBMISSIONS_FILE, "a") as f:
|
| 613 |
f.write(json.dumps(submission) + "\n")
|
|
|
|
| 614 |
|
| 615 |
|
| 616 |
# ---------------------------------------------------------------------------
|
|
@@ -1146,6 +1210,7 @@ def admin_remove_submission(agent_id: str, session_token: str):
|
|
| 1146 |
SUBMISSIONS_FILE.write_text(
|
| 1147 |
"\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
|
| 1148 |
)
|
|
|
|
| 1149 |
_log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
|
| 1150 |
return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
|
| 1151 |
|
|
@@ -2081,6 +2146,10 @@ contact details.
|
|
| 2081 |
return demo
|
| 2082 |
|
| 2083 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2084 |
if __name__ == "__main__":
|
| 2085 |
app = create_app()
|
| 2086 |
app.launch()
|
|
|
|
| 26 |
|
| 27 |
import gradio as gr
|
| 28 |
from gradio.themes.utils import colors, fonts, sizes
|
| 29 |
+
from huggingface_hub import HfApi
|
| 30 |
import pandas as pd
|
| 31 |
import plotly.graph_objects as go
|
| 32 |
|
|
|
|
| 134 |
}
|
| 135 |
with open(ADMIN_AUDIT_FILE, "a") as f:
|
| 136 |
f.write(json.dumps(record) + "\n")
|
| 137 |
+
_persist_file(str(ADMIN_AUDIT_FILE), "admin_audit.jsonl")
|
| 138 |
|
| 139 |
|
| 140 |
# Master secret env var name — used to derive per-user signing keys.
|
|
|
|
| 158 |
TASKS_FILE = Path("data/test.raw.json")
|
| 159 |
CANONICAL_HASHES_FILE = Path("data/canonical_hashes.json")
|
| 160 |
|
| 161 |
+
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
# Data persistence — external private dataset repo (survives Space restarts)
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
# Private HF dataset repo that mirrors the runtime data files (submissions,
# key requests, admin audit). Owner-only; accessed via HF_TOKEN.
_DATA_REPO_ID = "dolev31/st-webagentbench-data"
# Lazily created HfApi client — populated by _get_hf_api() on first use;
# stays None when no HF_TOKEN is configured.
_HF_API: HfApi | None = None
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _get_hf_api() -> HfApi | None:
    """Return a cached HfApi client, or None when no HF_TOKEN is set.

    The client is constructed at most once and memoised in the module-level
    ``_HF_API`` global. Without a token no client is ever created, which
    turns every persistence helper into a silent no-op.
    """
    global _HF_API
    if _HF_API is None and os.environ.get("HF_TOKEN"):
        # HfApi() picks the token up from the environment automatically.
        _HF_API = HfApi()
    return _HF_API
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _persist_file(local_path: str, repo_path: str) -> None:
    """Upload *local_path* to the private dataset repo as *repo_path*.

    Best-effort persistence: it is a no-op when no HF_TOKEN is configured,
    and an upload failure is logged (with traceback) rather than raised so
    the caller's local write is never undone.
    """
    client = _get_hf_api()
    if client is None:
        return
    upload_kwargs = dict(
        path_or_fileobj=local_path,
        path_in_repo=repo_path,
        repo_id=_DATA_REPO_ID,
        repo_type="dataset",
        commit_message=f"Auto-persist {repo_path}",
    )
    try:
        client.upload_file(**upload_kwargs)
    except Exception:
        logger.warning("Failed to persist %s", repo_path, exc_info=True)
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _restore_data_files() -> None:
    """Pull persisted data files down from the dataset repo at startup.

    Local files that already contain data are left untouched, so a restart
    mid-session never overwrites fresher local state. A missing HF_TOKEN
    turns the whole restore into a no-op.
    """
    client = _get_hf_api()
    if client is None:
        logger.info("No HF_TOKEN — skipping data restore from dataset repo")
        return
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)
    tracked = ("submissions.jsonl", "key_requests.jsonl", "admin_audit.jsonl")
    for name in tracked:
        target = data_dir / name
        # Non-empty local file wins (e.g. written earlier in this session).
        if target.exists() and target.stat().st_size > 0:
            continue
        try:
            client.hf_hub_download(
                repo_id=_DATA_REPO_ID,
                repo_type="dataset",
                filename=name,
                local_dir="data",
            )
        except Exception:
            # Best-effort: any failure is treated as "nothing persisted yet".
            logger.info("No existing %s in data repo (first run?)", name)
        else:
            logger.info("Restored %s from data repo", name)
| 219 |
+
|
| 220 |
+
|
| 221 |
# Load canonical task definitions for validation
|
| 222 |
_TASKS_DATA = None
|
| 223 |
_CANONICAL_HASHES = None
|
|
|
|
| 295 |
}
|
| 296 |
with open(KEY_REQUESTS_FILE, "a") as f:
|
| 297 |
f.write(json.dumps(record) + "\n")
|
| 298 |
+
_persist_file(str(KEY_REQUESTS_FILE), "key_requests.jsonl")
|
| 299 |
|
| 300 |
|
| 301 |
def _load_key_requests() -> list[dict]:
|
|
|
|
| 674 |
SUBMISSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
| 675 |
with open(SUBMISSIONS_FILE, "a") as f:
|
| 676 |
f.write(json.dumps(submission) + "\n")
|
| 677 |
+
_persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
|
| 678 |
|
| 679 |
|
| 680 |
# ---------------------------------------------------------------------------
|
|
|
|
| 1210 |
SUBMISSIONS_FILE.write_text(
|
| 1211 |
"\n".join(json.dumps(s) for s in filtered) + ("\n" if filtered else "")
|
| 1212 |
)
|
| 1213 |
+
_persist_file(str(SUBMISSIONS_FILE), "submissions.jsonl")
|
| 1214 |
_log_admin_action("remove_submission", f"Removed {removed} submission(s) with agent_id={agent_id.strip()}")
|
| 1215 |
return f"Removed {removed} submission(s) with agent_id '{agent_id}'."
|
| 1216 |
|
|
|
|
| 2146 |
return demo
|
| 2147 |
|
| 2148 |
|
| 2149 |
+
# Restore persisted data on module load (runs on Space startup)
|
| 2150 |
+
_restore_data_files()
|
| 2151 |
+
|
| 2152 |
+
|
| 2153 |
if __name__ == "__main__":
|
| 2154 |
app = create_app()
|
| 2155 |
app.launch()
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
gradio>=4.0
|
|
|
|
| 2 |
pandas
|
| 3 |
plotly
|
| 4 |
pydantic>=2.0
|
|
|
|
| 1 |
gradio>=4.0
|
| 2 |
+
huggingface_hub
|
| 3 |
pandas
|
| 4 |
plotly
|
| 5 |
pydantic>=2.0
|