#!/usr/bin/env python3
"""
VS Code Space ↔ HF Dataset Persistence
Restores /data on boot, auto-saves every 5 minutes by default (SYNC_INTERVAL)
"""
import os, sys, time, threading, shutil, traceback
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, snapshot_download
# ── Config ────────────────────────────────────────────────────────────────────
def parse_sync_interval(raw, default=300):
    """Return the auto-save interval in seconds parsed from *raw*.

    Falls back to *default* when *raw* is None, empty, not an integer, or
    not strictly positive. A non-positive interval would make the sync
    loop's timed wait return immediately and upload continuously.
    """
    text = (raw or "").strip()
    if not text:
        return default
    try:
        seconds = int(text)
    except ValueError:
        print(f"[SYNC] WARNING: invalid SYNC_INTERVAL {text!r}; using {default}s")
        return default
    if seconds <= 0:
        print(f"[SYNC] WARNING: SYNC_INTERVAL must be > 0 (got {seconds}); using {default}s")
        return default
    return seconds


HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("REPO", "")  # e.g. abc1181/vscode-storage
SYNC_INTERVAL = parse_sync_interval(os.environ.get("SYNC_INTERVAL"))  # 5 mins default
DATA_DIR = Path("/root/app/data")
PATH_IN_REPO = "workspace"  # folder name inside dataset repo
# NOTE(review): a bare "__pycache__" presumably matches only an entry with
# exactly that path; confirm whether "__pycache__/**" was intended.
IGNORE = [
    "*.log", "*.lock", "*.tmp", "*.pid",
    "__pycache__", "node_modules/**",
    ".git/**", "*.pyc",
]
# ── Setup ─────────────────────────────────────────────────────────────────────
# Make sure the local data directory exists before anything reads or writes it.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Persistence is optional: without a token or a target repo the script exits
# with status 0 rather than reporting a failure.
if not HF_TOKEN:
    print("[SYNC] WARNING: HF_TOKEN not set — persistence disabled")
    sys.exit(0)
if not DATASET_REPO:
    print("[SYNC] WARNING: REPO not set — persistence disabled")
    sys.exit(0)

# Hub client authenticated with HF_TOKEN.
api = HfApi(token=HF_TOKEN)
# ── Ensure dataset repo exists ────────────────────────────────────────────────
def ensure_repo():
    """Make sure the dataset repo ``DATASET_REPO`` exists.

    Looks the repo up first; if the lookup fails for any reason, tries to
    create it as a private dataset.

    Returns:
        True when the repo was found or the create call succeeded,
        False when both the lookup and the create call failed.
    """
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
    except Exception as lookup_err:
        # The lookup can fail for reasons other than "repo missing", so say
        # why we are falling back instead of hiding the error.
        print(f"[SYNC] Dataset lookup failed ({type(lookup_err).__name__}) — trying to create it")
        try:
            # exist_ok=True: if the repo does exist and only the lookup
            # failed, the create call must not abort startup on a conflict.
            api.create_repo(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                private=True,
                exist_ok=True,
            )
        except Exception as e:
            print(f"[SYNC] Failed to find/create dataset: {e}")
            return False
        print(f"[SYNC] Created dataset: {DATASET_REPO}")
        return True
    print(f"[SYNC] Dataset found: {DATASET_REPO}")
    return True
# ── Restore /data from dataset on boot ────────────────────────────────────────
def restore():
    """Copy the ``PATH_IN_REPO`` folder of the dataset into ``DATA_DIR``.

    Returns early when the dataset has no files under that folder.
    Otherwise the folder is downloaded into a temporary directory and every
    file is copied to the same relative path under ``DATA_DIR``, overwriting
    local files with the same name. Any exception is logged with a traceback
    and swallowed, so a failed restore does not stop the caller.
    """
    import tempfile  # function-local, as in the original

    print(f"[SYNC] Restoring /data from {DATASET_REPO}...")
    try:
        repo_files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
        prefix = f"{PATH_IN_REPO}/"
        ws_files = [name for name in repo_files if name.startswith(prefix)]
        if not ws_files:
            print("[SYNC] No files in dataset yet — starting fresh")
            return
        print(f"[SYNC] Found {len(ws_files)} files — downloading...")

        with tempfile.TemporaryDirectory() as tmpdir:
            snapshot_download(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                allow_patterns=f"{PATH_IN_REPO}/**",
                local_dir=tmpdir,
                token=HF_TOKEN,
            )
            src = Path(tmpdir) / PATH_IN_REPO
            if src.exists():
                for item in src.rglob("*"):
                    if not item.is_file():
                        continue
                    # Mirror the path below PATH_IN_REPO under DATA_DIR.
                    dest = DATA_DIR / item.relative_to(src)
                    dest.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(item), str(dest))
        print("[SYNC] ✅ Restore complete!")
    except Exception as e:
        print(f"[SYNC] Restore failed: {e}")
        traceback.print_exc()
# ── Save /data to dataset ─────────────────────────────────────────────────────
def save():
    """Upload the contents of ``DATA_DIR`` to ``PATH_IN_REPO`` in the dataset.

    Skips the upload when ``DATA_DIR`` contains no files. Files matching
    ``IGNORE`` are excluded by the upload call itself. Any exception is
    logged with a traceback and swallowed so the caller keeps running.
    """
    try:
        # Counts every file on disk; the ignore patterns are applied later by
        # upload_folder, so the logged number may exceed what is uploaded.
        file_count = sum(len(names) for _, _, names in os.walk(DATA_DIR))
        if not file_count:
            print("[SYNC] Nothing to save — /data is empty")
            return
        print(f"[SYNC] Uploading {file_count} files → {DATASET_REPO}...")
        api.upload_folder(
            folder_path=str(DATA_DIR),
            path_in_repo=PATH_IN_REPO,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"autosave {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            ignore_patterns=IGNORE,
        )
        print(f"[SYNC] 💾 Saved at {datetime.now().strftime('%H:%M:%S')}")
    except Exception as e:
        print(f"[SYNC] Save failed: {e}")
        traceback.print_exc()
# ── Background sync loop ──────────────────────────────────────────────────────
def sync_loop(stop_event):
    """Call ``save()`` every ``SYNC_INTERVAL`` seconds until *stop_event* is set.

    Args:
        stop_event: ``threading.Event`` used to request termination; the
            loop returns as soon as it is set, without a further save.
    """
    print(f"[SYNC] Auto-save loop started (every {SYNC_INTERVAL}s)")
    # Event.wait() returns True once the event is set and False on timeout,
    # so each timeout is one tick and a set event ends the loop.
    while not stop_event.wait(timeout=SYNC_INTERVAL):
        stamp = datetime.now().isoformat()
        print(f"[SYNC] Periodic save at {stamp}")
        save()
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Restore the workspace, start the auto-save thread, and block until shutdown.

    Exits with status 1 when the dataset repo can neither be found nor
    created. On Ctrl-C or SIGTERM the background loop is stopped and one
    final save is performed before returning.
    """
    import signal  # function-local so no file-level import has to change

    if not ensure_repo():
        sys.exit(1)
    restore()

    stop_event = threading.Event()
    t = threading.Thread(target=sync_loop, args=(stop_event,), daemon=True)
    t.start()

    def _raise_interrupt(signum, frame):
        # Route SIGTERM through the same shutdown path as Ctrl-C.
        raise KeyboardInterrupt

    # Keep alive — VS Code server runs separately
    try:
        # SIGTERM's default action ends the process without running the
        # handler below, which would skip the final save. signal.signal()
        # may only be called from the main thread.
        if threading.current_thread() is threading.main_thread():
            signal.signal(signal.SIGTERM, _raise_interrupt)
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("[SYNC] Shutting down — final save...")
        stop_event.set()
        # Let an in-flight periodic save finish so two uploads never run
        # against the repo at the same time.
        t.join()
        save()
        print("[SYNC] Done.")
# Run the sync service only when executed as a script, not when imported.
if __name__ == "__main__":
    main()