"""
VS Code Space -> HF Dataset persistence.

Restores /data from a Hugging Face dataset repo on boot and auto-saves
it back every SYNC_INTERVAL seconds.
"""

import os
import shutil
import sys
import threading
import time
import traceback
from datetime import datetime
from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

# --- Configuration (all overridable via environment) ------------------------
HF_TOKEN = os.environ.get("HF_TOKEN", "")        # HF access token; empty disables sync
DATASET_REPO = os.environ.get("REPO", "")        # dataset repo id, e.g. "user/name"
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "300"))  # seconds between auto-saves
DATA_DIR = Path("/root/app/data")                # local directory being persisted
PATH_IN_REPO = "workspace"                       # subfolder inside the dataset repo

# Transient / derived files that should never be uploaded.
IGNORE = [
    "*.log", "*.lock", "*.tmp", "*.pid",
    "__pycache__", "node_modules/**",
    ".git/**", "*.pyc",
]

DATA_DIR.mkdir(parents=True, exist_ok=True)

# Without credentials or a target repo there is nothing to sync; exit 0 so
# the container keeps booting instead of crash-looping on this sidecar.
if not HF_TOKEN:
    print("[SYNC] WARNING: HF_TOKEN not set -> persistence disabled")
    sys.exit(0)

if not DATASET_REPO:
    print("[SYNC] WARNING: REPO not set -> persistence disabled")
    sys.exit(0)

api = HfApi(token=HF_TOKEN)
|
|
| |
|
|
def ensure_repo():
    """Verify the dataset repo exists, creating it (private) if missing.

    Returns:
        True when the repo is available, False when it could neither be
        found nor created (e.g. bad token or missing permissions).
    """
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
        print(f"[SYNC] Dataset found: {DATASET_REPO}")
        return True
    except Exception:
        # repo_info raises both for "not found" and for auth errors; in
        # either case attempting to create the repo is the best next step.
        try:
            api.create_repo(repo_id=DATASET_REPO, repo_type="dataset", private=True)
            print(f"[SYNC] Created dataset: {DATASET_REPO}")
            return True
        except Exception as e:
            print(f"[SYNC] Failed to find/create dataset: {e}")
            return False
|
|
| |
|
|
def restore():
    """Download the PATH_IN_REPO tree from the dataset repo into DATA_DIR.

    Best-effort: any failure is logged (with traceback) and the space
    starts with whatever is already on disk.
    """
    print(f"[SYNC] Restoring /data from {DATASET_REPO}...")
    try:
        files = list(api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset"))
        ws_files = [f for f in files if f.startswith(f"{PATH_IN_REPO}/")]

        if not ws_files:
            print("[SYNC] No files in dataset yet -> starting fresh")
            return

        print(f"[SYNC] Found {len(ws_files)} files -> downloading...")
        import tempfile
        with tempfile.TemporaryDirectory() as tmpdir:
            # Download into a scratch dir first so a partial/failed snapshot
            # never leaves half-written files inside DATA_DIR.
            snapshot_download(
                repo_id=DATASET_REPO,
                repo_type="dataset",
                allow_patterns=f"{PATH_IN_REPO}/**",
                local_dir=tmpdir,
                token=HF_TOKEN,
            )
            src = Path(tmpdir) / PATH_IN_REPO
            if src.exists():
                # Copy file-by-file, recreating the directory structure;
                # copy2 preserves timestamps/metadata.
                for item in src.rglob("*"):
                    if item.is_file():
                        dest = DATA_DIR / item.relative_to(src)
                        dest.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy2(str(item), str(dest))
        print("[SYNC] Restore complete!")
    except Exception as e:
        print(f"[SYNC] Restore failed: {e}")
        traceback.print_exc()
|
|
| |
|
|
def save():
    """Upload DATA_DIR to the dataset repo under PATH_IN_REPO.

    Skips the upload when the directory is empty (avoids churning empty
    commits). Failures are logged, never raised, so the periodic sync
    loop keeps running.
    """
    try:
        # Count files recursively; os.walk skips nothing, IGNORE filtering
        # is applied server-side by upload_folder.
        file_count = sum(1 for _, _, fs in os.walk(DATA_DIR) for _ in fs)
        if file_count == 0:
            print("[SYNC] Nothing to save - /data is empty")
            return

        print(f"[SYNC] Uploading {file_count} files -> {DATASET_REPO}...")
        api.upload_folder(
            folder_path=str(DATA_DIR),
            path_in_repo=PATH_IN_REPO,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"autosave {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            ignore_patterns=IGNORE,
        )
        print(f"[SYNC] Saved at {datetime.now().strftime('%H:%M:%S')}")
    except Exception as e:
        print(f"[SYNC] Save failed: {e}")
        traceback.print_exc()
|
|
| |
|
|
def sync_loop(stop_event):
    """Call save() every SYNC_INTERVAL seconds until *stop_event* is set."""
    print(f"[SYNC] Auto-save loop started (every {SYNC_INTERVAL}s)")
    while True:
        if stop_event.is_set():
            break
        # wait() doubles as the sleep; it returns True if the event fired
        # during the timeout, which means "shut down now, skip the save".
        interrupted = stop_event.wait(timeout=SYNC_INTERVAL)
        if interrupted:
            break
        print(f"[SYNC] Periodic save at {datetime.now().isoformat()}")
        save()
|
|
| |
|
|
def main():
    """Entry point: verify the repo, restore /data, then auto-save forever.

    Exits with status 1 when the dataset repo cannot be found or created.
    On Ctrl-C / KeyboardInterrupt, performs one final synchronous save so
    the most recent edits are not lost.
    """
    if not ensure_repo():
        sys.exit(1)

    restore()

    # Daemon thread: the process can exit even while the loop is mid-wait.
    stop_event = threading.Event()
    t = threading.Thread(target=sync_loop, args=(stop_event,), daemon=True)
    t.start()

    try:
        while True:
            time.sleep(60)  # keep the main thread alive; work happens in the loop thread
    except KeyboardInterrupt:
        print("[SYNC] Shutting down - final save...")
        stop_event.set()
        save()
        print("[SYNC] Done.")


if __name__ == "__main__":
    main()