Spaces:
abc1181
/
Runtime error

code / scripts /sync_hf.py
abc1181's picture
Update scripts/sync_hf.py
ec62f6a verified
#!/usr/bin/env python3
"""
VS Code Space — HF Dataset Persistence
Restores /data on boot, auto-saves every 5 minutes
"""
import os, sys, time, threading, shutil, traceback
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, snapshot_download
# ── Config ───────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("REPO", "")  # e.g. abc1181/vscode-storage


def _read_sync_interval(default=300):
    """Return the autosave period in seconds from the SYNC_INTERVAL env var.

    Falls back to *default* when the variable is unset, empty, not an
    integer, or not strictly positive. A zero or negative period would make
    the timed wait in the sync loop return immediately, so saves would run
    back to back.
    """
    raw = os.environ.get("SYNC_INTERVAL", "").strip()
    if not raw:
        return default
    try:
        seconds = int(raw)
    except ValueError:
        seconds = 0  # treated as invalid below
    if seconds <= 0:
        print(f"[SYNC] WARNING: invalid SYNC_INTERVAL={raw!r}; using {default}s")
        return default
    return seconds


SYNC_INTERVAL = _read_sync_interval()  # 5 mins default
DATA_DIR = Path("/root/app/data")
PATH_IN_REPO = "workspace"  # folder name inside dataset repo

# Patterns passed to upload_folder(ignore_patterns=...). They are matched
# fnmatch-style against paths relative to the uploaded folder (see the
# huggingface_hub docs), so a pattern such as "node_modules/**" only covers a
# top-level directory. The "*/..." variants cover the same directories when
# they are nested inside a project folder.
IGNORE = [
    "*.log", "*.lock", "*.tmp", "*.pid", "*.pyc",
    "__pycache__", "__pycache__/**", "*/__pycache__/**",
    "node_modules/**", "*/node_modules/**",
    ".git/**",
]
# ── Setup ────────────────────────────────────────────────────────────────────
# Create the local data directory up front so later walks and copies never
# hit a missing path.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Persistence is optional: when the token or the target repo is missing the
# script stops with exit status 0 after printing a warning.
if not HF_TOKEN:
    print("[SYNC] WARNING: HF_TOKEN not set — persistence disabled")
    sys.exit(0)
if not DATASET_REPO:
    print("[SYNC] WARNING: REPO not set — persistence disabled")
    sys.exit(0)

# Authenticated Hub client shared by the functions in this script.
api = HfApi(token=HF_TOKEN)
# ── Ensure dataset repo exists ───────────────────────────────────────────────
def ensure_repo():
    """Make sure the dataset repo used for persistence exists.

    Looks the repo up first. If the lookup fails for any reason, the reason
    is logged and the repo is created as a private dataset.

    Returns:
        bool: True when the repo was found or created, False when both the
        lookup and the creation failed.
    """
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
    except Exception as lookup_err:
        # Report why the lookup failed instead of discarding the error, so a
        # bad token or network problem is visible in the logs.
        print(f"[SYNC] Dataset lookup failed ({lookup_err}); trying to create it")
    else:
        print(f"[SYNC] Dataset found: {DATASET_REPO}")
        return True

    try:
        # exist_ok=True: if the lookup failed for a transient reason while
        # the repo actually exists, creation must not fail on the conflict.
        api.create_repo(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            private=True,
            exist_ok=True,
        )
    except Exception as e:
        print(f"[SYNC] Failed to find/create dataset: {e}")
        return False

    print(f"[SYNC] Dataset ready: {DATASET_REPO}")
    return True
# ── Restore /data from dataset on boot ───────────────────────────────────────
def restore():
    """Copy the dataset's ``PATH_IN_REPO`` folder into ``DATA_DIR``.

    Lists the files in the dataset repo and returns early when none live
    under ``PATH_IN_REPO``. Otherwise the folder is downloaded into a
    temporary directory and every file is copied to the same relative path
    below ``DATA_DIR``, creating parent directories as needed. Existing
    local files with the same path are overwritten; other local files are
    left alone.

    Best-effort: any exception is logged with its traceback and swallowed,
    so a failed restore does not stop the caller.
    """
    import tempfile  # local import: not among the module-level imports

    print(f"[SYNC] Restoring /data from {DATASET_REPO}...")
    prefix = f"{PATH_IN_REPO}/"
    try:
        repo_files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
        ws_files = [name for name in repo_files if name.startswith(prefix)]
        if not ws_files:
            print("[SYNC] No files in dataset yet — starting fresh")
            return
        print(f"[SYNC] Found {len(ws_files)} files — downloading...")

        # Download into a scratch directory first so only the files under
        # PATH_IN_REPO end up in DATA_DIR.
        with tempfile.TemporaryDirectory() as tmpdir:
            snapshot_download(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                allow_patterns=f"{PATH_IN_REPO}/**",
                local_dir=tmpdir,
                token=HF_TOKEN,
            )
            src = Path(tmpdir) / PATH_IN_REPO
            if src.exists():
                for item in src.rglob("*"):
                    if not item.is_file():
                        continue
                    dest = DATA_DIR / item.relative_to(src)
                    dest.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(item), str(dest))
        print("[SYNC] ✅ Restore complete!")
    except Exception as e:
        print(f"[SYNC] Restore failed: {e}")
        traceback.print_exc()
# ── Save /data to dataset ─────────────────────────────────────────────────────
def save():
    """Upload the contents of ``DATA_DIR`` to the dataset repo.

    Skips the upload when ``DATA_DIR`` contains no files at all. Otherwise
    the whole folder is committed under ``PATH_IN_REPO`` with a timestamped
    "autosave" commit message, excluding paths that match ``IGNORE``.

    Best-effort: any exception is logged with its traceback and swallowed,
    so a failed save does not stop the caller.
    """
    try:
        # Counts every file on disk; the IGNORE patterns are applied only by
        # upload_folder, so the number logged may exceed what is uploaded.
        file_count = sum(len(files) for _, _, files in os.walk(DATA_DIR))
        if file_count == 0:
            print("[SYNC] Nothing to save — /data is empty")
            return
        print(f"[SYNC] Uploading {file_count} files → {DATASET_REPO}...")
        api.upload_folder(
            folder_path=str(DATA_DIR),
            path_in_repo=PATH_IN_REPO,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"autosave {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            ignore_patterns=IGNORE,
        )
        print(f"[SYNC] 💾 Saved at {datetime.now().strftime('%H:%M:%S')}")
    except Exception as e:
        print(f"[SYNC] Save failed: {e}")
        traceback.print_exc()
# ── Background sync loop ──────────────────────────────────────────────────────
def sync_loop(stop_event):
    """Call ``save()`` every ``SYNC_INTERVAL`` seconds until told to stop.

    Args:
        stop_event: ``threading.Event``; once it is set the loop exits
            without starting another save.
    """
    print(f"[SYNC] Auto-save loop started (every {SYNC_INTERVAL}s)")
    while True:
        # The timed wait acts as an interruptible sleep: it returns True as
        # soon as the event is set and False when the interval has elapsed.
        stop_requested = stop_event.wait(timeout=SYNC_INTERVAL)
        if stop_requested:
            return
        print(f"[SYNC] Periodic save at {datetime.now().isoformat()}")
        save()
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Restore the workspace, start the autosave thread, and block until stopped.

    Exits with status 1 when the dataset repo can neither be found nor
    created. On Ctrl+C or SIGTERM the autosave thread is stopped and one
    final save is performed before returning.
    """
    import signal  # local import: only needed for the SIGTERM hook below

    if not ensure_repo():
        sys.exit(1)
    restore()

    stop_event = threading.Event()
    worker = threading.Thread(target=sync_loop, args=(stop_event,), daemon=True)
    worker.start()

    def _raise_interrupt(signum, frame):
        # Route SIGTERM into the same shutdown path as Ctrl+C. Process
        # managers normally stop a process with SIGTERM, which would
        # otherwise end it without the final save.
        raise KeyboardInterrupt

    # signal.signal() may only be called from the main thread.
    in_main_thread = threading.current_thread() is threading.main_thread()
    if in_main_thread:
        signal.signal(signal.SIGTERM, _raise_interrupt)

    # Keep alive — VS Code server runs separately
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        if in_main_thread:
            # A repeated SIGTERM must not abort the final save half-way.
            signal.signal(signal.SIGTERM, signal.SIG_IGN)
        print("[SYNC] Shutting down — final save...")
        stop_event.set()
        # Wait for a periodic save that may be in flight so the final
        # upload does not run concurrently with it.
        worker.join()
        save()
        print("[SYNC] Done.")


if __name__ == "__main__":
    main()