#!/usr/bin/env python3
"""
VS Code Space — HF Dataset Persistence

Restores the local data directory (DATA_DIR) from a Hugging Face dataset
repo on boot, then auto-saves it back every SYNC_INTERVAL seconds
(5 minutes by default). A final save is performed on Ctrl+C or SIGTERM.
"""

import os
import shutil
import signal
import sys
import tempfile
import threading
import time
import traceback
from datetime import datetime
from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

# ── Config ───────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("REPO", "")  # e.g. abc1181/vscode-storage
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "300"))  # 5 mins default
DATA_DIR = Path("/root/app/data")
PATH_IN_REPO = "workspace"  # folder name inside dataset repo

# Glob patterns excluded from every upload.
IGNORE = [
    "*.log",
    "*.lock",
    "*.tmp",
    "*.pid",
    "__pycache__",
    "node_modules/**",
    ".git/**",
    "*.pyc",
]

# ── Setup ────────────────────────────────────────────────────────────────────
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Missing configuration is not an error: exit cleanly so the surrounding
# start-up is not treated as failed, just without persistence.
if not HF_TOKEN:
    print("[SYNC] WARNING: HF_TOKEN not set — persistence disabled")
    sys.exit(0)

if not DATASET_REPO:
    print("[SYNC] WARNING: REPO not set — persistence disabled")
    sys.exit(0)

api = HfApi(token=HF_TOKEN)


# ── Ensure dataset repo exists ───────────────────────────────────────────────
def ensure_repo():
    """Make sure the dataset repo is available, creating it if needed.

    Any failure of the lookup is treated as "repo missing" and triggers an
    attempt to create it as a private dataset.

    Returns:
        True if the repo was found or created, False if both steps failed.
    """
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
        print(f"[SYNC] Dataset found: {DATASET_REPO}")
        return True
    except Exception:
        try:
            api.create_repo(repo_id=DATASET_REPO, repo_type="dataset", private=True)
            print(f"[SYNC] Created dataset: {DATASET_REPO}")
            return True
        except Exception as e:
            print(f"[SYNC] Failed to find/create dataset: {e}")
            return False


# ── Restore /data from dataset on boot ───────────────────────────────────────
def restore():
    """Download the saved workspace from the dataset repo into DATA_DIR.

    Files under PATH_IN_REPO are fetched into a temporary directory and then
    copied over DATA_DIR, keeping their relative paths. Existing local files
    with the same path are overwritten; other local files are left alone.
    Failures are printed with a traceback and never raised.
    """
    print(f"[SYNC] Restoring /data from {DATASET_REPO}...")
    try:
        files = list(api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset"))
        ws_files = [f for f in files if f.startswith(f"{PATH_IN_REPO}/")]

        if not ws_files:
            print("[SYNC] No files in dataset yet — starting fresh")
            return

        print(f"[SYNC] Found {len(ws_files)} files — downloading...")

        # Download into a scratch directory first so the PATH_IN_REPO prefix
        # can be stripped while copying into DATA_DIR.
        with tempfile.TemporaryDirectory() as tmpdir:
            snapshot_download(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                allow_patterns=f"{PATH_IN_REPO}/**",
                local_dir=tmpdir,
                token=HF_TOKEN,
            )
            src = Path(tmpdir) / PATH_IN_REPO
            if src.exists():
                for item in src.rglob("*"):
                    if item.is_file():
                        dest = DATA_DIR / item.relative_to(src)
                        dest.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy2(str(item), str(dest))

        print("[SYNC] ✅ Restore complete!")
    except Exception as e:
        print(f"[SYNC] Restore failed: {e}")
        traceback.print_exc()


# ── Save /data to dataset ─────────────────────────────────────────────────────
def save():
    """Upload DATA_DIR to PATH_IN_REPO in the dataset repo as one commit.

    Nothing is uploaded when DATA_DIR holds no files at all. Paths matching
    IGNORE are excluded from the upload (they still count toward the logged
    file total). Failures are printed with a traceback and never raised.
    """
    try:
        file_count = sum(1 for _, _, fs in os.walk(DATA_DIR) for _ in fs)
        if file_count == 0:
            print("[SYNC] Nothing to save — /data is empty")
            return

        print(f"[SYNC] Uploading {file_count} files → {DATASET_REPO}...")
        api.upload_folder(
            folder_path=str(DATA_DIR),
            path_in_repo=PATH_IN_REPO,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"autosave {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            ignore_patterns=IGNORE,
        )
        print(f"[SYNC] 💾 Saved at {datetime.now().strftime('%H:%M:%S')}")
    except Exception as e:
        print(f"[SYNC] Save failed: {e}")
        traceback.print_exc()


# ── Background sync loop ──────────────────────────────────────────────────────
def sync_loop(stop_event):
    """Call save() every SYNC_INTERVAL seconds until stop_event is set.

    Args:
        stop_event: threading.Event used to request termination; it is also
            what the loop sleeps on, so setting it wakes the loop immediately.
    """
    print(f"[SYNC] Auto-save loop started (every {SYNC_INTERVAL}s)")
    # wait() returns True as soon as the event is set (including when it is
    # already set on entry), and False when the interval elapsed.
    while not stop_event.wait(timeout=SYNC_INTERVAL):
        print(f"[SYNC] Periodic save at {datetime.now().isoformat()}")
        save()


# ── Main ──────────────────────────────────────────────────────────────────────
def _raise_keyboard_interrupt(signum, frame):
    """Signal handler: turn the signal into the Ctrl+C shutdown path."""
    raise KeyboardInterrupt


def main():
    """Restore on boot, start the auto-save thread, and block until stopped.

    Exits with status 1 if the dataset repo can neither be found nor created.
    On Ctrl+C or SIGTERM the auto-save loop is stopped and one final save is
    performed before returning.
    """
    if not ensure_repo():
        sys.exit(1)

    restore()

    stop_event = threading.Event()
    t = threading.Thread(target=sync_loop, args=(stop_event,), daemon=True)
    t.start()

    # Container runtimes normally stop a process with SIGTERM, not SIGINT.
    # Without a handler the process would die with no final save, losing up
    # to SYNC_INTERVAL seconds of work. Handlers can only be installed from
    # the main thread, hence the guard.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _raise_keyboard_interrupt)

    # Keep alive — VS Code server runs separately
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("[SYNC] Shutting down — final save...")
        stop_event.set()
        save()
        print("[SYNC] Done.")


if __name__ == "__main__":
    main()