Spaces:
abc1181
/
Runtime error

abc1181 committed on
Commit
b6724be
Β·
verified Β·
1 Parent(s): c5c01a8

Create scripts/sync_hf.py

Browse files
Files changed (1) hide show
  1. scripts/sync_hf.py +149 -0
scripts/sync_hf.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VS Code Space — HF Dataset Persistence
4
+ Restores /data on boot, auto-saves every 5 minutes
5
+ """
6
+
7
+ import os, sys, time, threading, shutil, traceback
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from huggingface_hub import HfApi, snapshot_download
11
+
12
+ # ── Config ───────────────────────────────────────────────────────────────────
13
+
14
def _read_sync_interval(default=300):
    """Return the autosave period in seconds from the SYNC_INTERVAL env var.

    Falls back to *default* when the variable is unset or blank, is not an
    integer, or is not positive. A non-positive period would make the sync
    loop's timed wait return immediately, so it would upload back-to-back.
    """
    raw = os.environ.get("SYNC_INTERVAL", "")
    if not raw.strip():
        return default
    try:
        seconds = int(raw)
    except ValueError:
        print(f"[SYNC] WARNING: invalid SYNC_INTERVAL={raw!r}, using {default}s")
        return default
    if seconds <= 0:
        print(f"[SYNC] WARNING: SYNC_INTERVAL must be positive, using {default}s")
        return default
    return seconds


HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("REPO", "")  # e.g. abc1181/vscode-storage
SYNC_INTERVAL = _read_sync_interval()  # seconds; 5 minutes by default
DATA_DIR = Path("/data")
PATH_IN_REPO = "workspace"  # folder name inside dataset repo

# Glob patterns for files that must not be uploaded.
# NOTE(review): huggingface_hub is expected to match these fnmatch-style
# against each file's path relative to the uploaded folder, so a bare
# directory name never matches a file and "dir/**" only covers a top-level
# directory. The "**/dir/**" variants cover nested ones - confirm against
# the installed huggingface_hub version.
IGNORE = [
    "*.log", "*.lock", "*.tmp", "*.pid", "*.pyc",
    "__pycache__", "__pycache__/**", "**/__pycache__/**",
    "node_modules/**", "**/node_modules/**",
    ".git/**", "**/.git/**",
]
25
+
26
+ # ── Setup ────────────────────────────────────────────────────────────────────
27
+
28
# Create the data directory up front; an existing one is left untouched.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Both settings are mandatory. The first one found missing is reported and
# the script exits with status 0, i.e. persistence is simply switched off.
for _env_name, _env_value in (("HF_TOKEN", HF_TOKEN), ("REPO", DATASET_REPO)):
    if _env_value:
        continue
    print(f"[SYNC] WARNING: {_env_name} not set β€” persistence disabled")
    sys.exit(0)
del _env_name, _env_value

# Shared Hub client used by every function below.
api = HfApi(token=HF_TOKEN)
39
+
40
+ # ── Ensure dataset repo exists ───────────────────────────────────────────────
41
+
42
def ensure_repo():
    """Make sure the dataset repo used for persistence exists.

    Looks the repo up first and, when the lookup fails for any reason,
    attempts to create it as a private dataset.

    Returns:
        True when the repo was found or the create call succeeded, False
        when both the lookup and the creation attempt failed.
    """
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
    except Exception as lookup_error:
        # The lookup can fail for reasons other than a missing repo, so
        # report why instead of silently discarding the error.
        print(f"[SYNC] Dataset lookup failed ({lookup_error}); trying to create it")
    else:
        print(f"[SYNC] Dataset found: {DATASET_REPO}")
        return True

    try:
        # exist_ok=True: if the lookup failure was transient and the repo is
        # already there, creation must not fail with an "already exists"
        # conflict and disable persistence for the whole session.
        api.create_repo(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            private=True,
            exist_ok=True,
        )
    except Exception as e:
        print(f"[SYNC] Failed to find/create dataset: {e}")
        return False

    print(f"[SYNC] Created dataset: {DATASET_REPO}")
    return True
55
+
56
+ # ── Restore /data from dataset on boot ───────────────────────────────────────
57
+
58
def restore():
    """Copy the saved workspace from the dataset repo into DATA_DIR.

    Lists the repo's files and returns early when nothing is stored under
    PATH_IN_REPO. Otherwise that folder is downloaded into a temporary
    directory and every file in it is copied to the same relative path
    under DATA_DIR. Any exception is printed with its traceback and not
    re-raised.
    """
    print(f"[SYNC] Restoring /data from {DATASET_REPO}...")
    workspace_prefix = f"{PATH_IN_REPO}/"
    try:
        remote_paths = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
        saved_count = sum(1 for path in remote_paths if path.startswith(workspace_prefix))

        if saved_count == 0:
            print("[SYNC] No files in dataset yet β€” starting fresh")
            return

        print(f"[SYNC] Found {saved_count} files β€” downloading...")
        import tempfile

        with tempfile.TemporaryDirectory() as staging:
            # Download only the workspace folder into the staging directory.
            snapshot_download(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                allow_patterns=f"{PATH_IN_REPO}/**",
                local_dir=staging,
                token=HF_TOKEN,
            )
            workspace_root = Path(staging) / PATH_IN_REPO
            if workspace_root.exists():
                downloaded = (p for p in workspace_root.rglob("*") if p.is_file())
                for source_file in downloaded:
                    # Mirror the repo layout under DATA_DIR.
                    target = DATA_DIR / source_file.relative_to(workspace_root)
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(source_file), str(target))
            print("[SYNC] βœ… Restore complete!")

    except Exception as e:
        print(f"[SYNC] Restore failed: {e}")
        traceback.print_exc()
90
+
91
+ # ── Save /data to dataset ─────────────────────────────────────────────────────
92
+
93
def save():
    """Upload the contents of DATA_DIR to the dataset repo.

    Does nothing but log when DATA_DIR contains no files. Otherwise the
    folder is uploaded under PATH_IN_REPO with a timestamped commit
    message, skipping anything that matches IGNORE. Any exception is
    printed with its traceback and not re-raised.
    """
    try:
        # Count every file below DATA_DIR, however deeply nested.
        total = sum(len(names) for _dirpath, _subdirs, names in os.walk(DATA_DIR))
        if not total:
            print("[SYNC] Nothing to save β€” /data is empty")
            return

        print(f"[SYNC] Uploading {total} files β†’ {DATASET_REPO}...")
        started = datetime.now()
        upload_args = dict(
            folder_path=str(DATA_DIR),
            path_in_repo=PATH_IN_REPO,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"autosave {started:%Y-%m-%d %H:%M:%S}",
            ignore_patterns=IGNORE,
        )
        api.upload_folder(**upload_args)
        finished = datetime.now()
        print(f"[SYNC] πŸ’Ύ Saved at {finished:%H:%M:%S}")

    except Exception as e:
        print(f"[SYNC] Save failed: {e}")
        traceback.print_exc()
115
+
116
+ # ── Background sync loop ──────────────────────────────────────────────────────
117
+
118
def sync_loop(stop_event):
    """Call save() every SYNC_INTERVAL seconds until *stop_event* is set.

    Args:
        stop_event: a threading.Event; setting it ends the loop without
            running another periodic save.
    """
    print(f"[SYNC] Auto-save loop started (every {SYNC_INTERVAL}s)")
    # wait() is both the sleep and the stop check: it returns True as soon
    # as the event is set and False when the interval elapses.
    while not stop_event.wait(timeout=SYNC_INTERVAL):
        print(f"[SYNC] Periodic save at {datetime.now().isoformat()}")
        save()
125
+
126
+ # ── Main ──────────────────────────────────────────────────────────────────────
127
+
128
def main():
    """Run the persistence service: restore once, then autosave until stopped.

    Exits with status 1 when the dataset repo can neither be found nor
    created. Otherwise restores DATA_DIR, starts the background autosave
    thread, and blocks until Ctrl-C or SIGTERM arrives, at which point one
    final save is performed before returning.
    """
    import signal

    if not ensure_repo():
        sys.exit(1)

    restore()

    stop_event = threading.Event()

    def _request_stop(signum, frame):
        # Only flag the shutdown here; the final save runs in main() below.
        stop_event.set()

    # SIGTERM is the usual container stop signal; without a handler the
    # process would die with no final save. The handler is installed after
    # restore() so that a stop during restore never uploads a half-restored
    # directory.
    try:
        signal.signal(signal.SIGTERM, _request_stop)
    except ValueError:
        # signal.signal() only works in the main thread; fall back to
        # Ctrl-C-only shutdown when main() is called from elsewhere.
        pass

    t = threading.Thread(target=sync_loop, args=(stop_event,), daemon=True)
    t.start()

    # Keep alive - the VS Code server runs separately.
    try:
        while not stop_event.wait(timeout=60):
            pass
    except KeyboardInterrupt:
        pass

    print("[SYNC] Shutting down β€” final save...")
    stop_event.set()
    save()
    print("[SYNC] Done.")


if __name__ == "__main__":
    main()