#!/usr/bin/env python3
"""
HuggingRun v2 — Ubuntu Server on HuggingFace Spaces.

Persistence via HF Dataset (direct file sync, no archives):
- Sync scope: /home, /root, /usr/local, /opt, /var/lib, /etc
- System packages: saved as package name list, restored via apt install
- Symlinks/permissions safe: no system binary dirs synced
"""
import http.server
import json
import os
import shutil
import signal
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone

# ── Config ────────────────────────────────────────────────────────────
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
SSH_PORT = os.environ.get("SSH_PORT", "2222")
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
LOGFILE = "/var/log/huggingrun.log"
BASE_PKG_FILE = "/etc/base-packages.list"
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
PIP_FILE = os.path.join(PERSIST_PATH, "pip-packages.txt")

# Directories to sync (user data only, no system binaries)
SYNC_DIRS = ["/home", "/root", "/usr/local", "/opt", "/var/lib", "/var/log", "/etc"]

# Exclude from rsync (relative paths — rsync matches from transfer root)
RSYNC_EXCLUDES = ["*.sock", "*.pid", "*.lock"]

# Extra excludes only for /etc/ restore (container-managed files)
ETC_RESTORE_EXCLUDES = [
    "hostname", "hosts", "resolv.conf", "mtab", "fstab",
    "alternatives",   # symlinks managed by dpkg
    "ld.so.cache",    # rebuilt by ldconfig
]

# Exclude from HF upload
UPLOAD_IGNORE = [
    "__pycache__", "*.pyc", ".git", ".git*", "*.sock", "*.lock",
    ".huggingface", ".cache",
    "huggingrun.env",  # contains HF_TOKEN
]


# ── Logging ───────────────────────────────────────────────────────────
def log(msg):
    """Print a timestamped line to stderr and best-effort append it to LOGFILE."""
    ts = time.strftime("%H:%M:%S", time.gmtime())
    line = f"[{ts}] {msg}"
    print(line, file=sys.stderr, flush=True)
    try:
        with open(LOGFILE, "a") as f:
            f.write(line + "\n")
    except Exception:
        # Logging must never take the service down (e.g. read-only /var/log).
        pass


def sh(cmd, log_err=False):
    """Run shell command, return (exit_code, stdout)."""
    r = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if log_err and r.returncode != 0 and r.stderr:
        log(f" cmd error: {r.stderr.strip()[:200]}")
    return r.returncode, (r.stdout or "").strip()


# ── Config Resolution ─────────────────────────────────────────────────
def resolve_config():
    """Resolve HF_TOKEN / HF_DATASET_REPO from env, SPACE_ID, or whoami()."""
    global HF_TOKEN, HF_DATASET_REPO
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_DATASET_REPO:
        space_id = os.environ.get("SPACE_ID", "")
        if space_id:
            HF_DATASET_REPO = f"{space_id}-data"
        elif HF_TOKEN:
            try:
                from huggingface_hub import HfApi
                name = HfApi(token=HF_TOKEN).whoami()["name"]
                HF_DATASET_REPO = f"{name}/HuggingRun-data"
            except Exception:
                # Offline / bad token: run without persistence.
                pass
    os.environ["HF_DATASET_REPO"] = HF_DATASET_REPO
    log("=" * 50)
    log("HuggingRun v2")
    log(f" HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}")
    log(f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}")
    log(f" SYNC_DIRS: {SYNC_DIRS}")
    log(f" SYNC_INTERVAL: {SYNC_INTERVAL}s")
    log("=" * 50)


def ensure_dataset_repo():
    """Create the private HF dataset repo if it does not already exist."""
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    from huggingface_hub import HfApi
    api = HfApi(token=HF_TOKEN)
    try:
        api.repo_info(repo_id=HF_DATASET_REPO, repo_type="dataset")
        log(f"dataset: {HF_DATASET_REPO}")
    except Exception:
        api.create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", private=True)
        log(f"created dataset: {HF_DATASET_REPO}")


# ── Service helpers ───────────────────────────────────────────────────
def _spawn_sshd():
    """Launch sshd in the foreground with the standard option set.

    Shared by initial startup and post-restore restart so the two code
    paths cannot drift apart. Returns the Popen handle after a short
    settle delay; caller inspects proc.poll() for success.
    """
    os.makedirs("/run/sshd", exist_ok=True)
    proc = subprocess.Popen([
        "/usr/sbin/sshd", "-D", "-e",
        "-o", f"Port={SSH_PORT}",
        "-o", "ListenAddress=127.0.0.1",
        "-o", "PermitRootLogin=yes",
        "-o", "PasswordAuthentication=yes",
        "-o", "PermitEmptyPasswords=no",
        "-o", "UsePAM=yes",
    ])
    time.sleep(0.5)
    return proc


# ── Restore ───────────────────────────────────────────────────────────
def restore():
    """Download dataset → rsync to / → reinstall apt packages."""
    if not HF_TOKEN or not HF_DATASET_REPO:
        return

    # Download dataset
    log("── RESTORE: downloading dataset")
    os.makedirs(PERSIST_PATH, exist_ok=True)
    t0 = time.time()
    try:
        from huggingface_hub import snapshot_download
        snapshot_download(
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            local_dir=PERSIST_PATH,
            token=HF_TOKEN,
        )
        log(f" downloaded ({time.time()-t0:.1f}s)")
    except Exception as e:
        log(f" download failed: {e}")
        return

    # Check if there's data to restore
    has_data = any(
        os.path.isdir(os.path.join(PERSIST_PATH, d.lstrip("/"))) for d in SYNC_DIRS
    )
    if not has_data:
        log(" empty dataset, fresh start")
        return

    # Rsync each directory back (no --delete, don't remove existing files)
    log("── RESTORE: rsync /data/ → /")
    base_excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    etc_excludes = " ".join(f"--exclude='{e}'" for e in ETC_RESTORE_EXCLUDES)
    for d in SYNC_DIRS:
        src = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(src):
            continue
        excludes = base_excludes
        if d == "/etc":
            # Skip container-managed files that dpkg/the runtime own.
            excludes = f"{base_excludes} {etc_excludes}"
        cmd = f"rsync -rlptD {excludes} '{src}/' '{d}/'"
        rc, _ = sh(cmd, log_err=True)
        if rc == 0:
            log(f" {d}/ restored")
        else:
            log(f" {d}/ restore failed (rc={rc})")

    # Reinstall apt packages (saved list minus the base image's list)
    if os.path.exists(PKG_FILE) and os.path.exists(BASE_PKG_FILE):
        with open(BASE_PKG_FILE) as f:
            base = set(f.read().split())
        with open(PKG_FILE) as f:
            saved = set(f.read().split())
        to_install = sorted(saved - base)
        if to_install:
            log(f"── RESTORE: apt install {len(to_install)} packages")
            rc, _ = sh(
                f"apt-get update -qq && apt-get install -y --reinstall "
                f"--no-install-recommends {' '.join(to_install)}"
            )
            if rc == 0:
                log(f" packages restored")
            else:
                log(f" some packages failed")
        else:
            log("── RESTORE: no extra packages")
    else:
        log("── RESTORE: no package list")

    # Reinstall pip packages
    if os.path.exists(PIP_FILE):
        with open(PIP_FILE) as f:
            # Fix: test '#' on the *stripped* line so indented comments
            # don't count as packages.
            pip_pkgs = [
                l.strip() for l in f
                if l.strip() and not l.strip().startswith('#')
            ]
        if pip_pkgs:
            log(f"── RESTORE: pip install {len(pip_pkgs)} packages")
            rc, _ = sh(f"pip install --break-system-packages -q -r '{PIP_FILE}'")
            if rc == 0:
                log(f" pip packages restored")
            else:
                log(f" some pip packages failed")

    # Fix SSH key permissions (HF dataset doesn't preserve permissions)
    sh("chmod 600 /etc/ssh/ssh_host_*_key 2>/dev/null")
    sh("chmod 644 /etc/ssh/ssh_host_*_key.pub 2>/dev/null")

    # Restart sshd to pick up restored host keys
    sh("pkill sshd")
    time.sleep(1)
    proc = _spawn_sshd()
    if proc.poll() is None:
        log(f" sshd restarted ok PID={proc.pid}")
    else:
        log(f" sshd restart FAILED (exit={proc.poll()})")

    # Fix up after restore (restored /usr/local libs need the linker cache)
    sh('ldconfig 2>/dev/null')
    log("── RESTORE: complete")


# ── Save + Upload ─────────────────────────────────────────────────────
def save_and_upload():
    """Rsync sync dirs to /data/, then upload to HF dataset."""
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    from huggingface_hub import HfApi
    log("── SYNC: start")
    t0_total = time.time()

    # Save apt package list
    rc, out = sh("dpkg-query -W -f='${Package}\\n'")
    if rc == 0 and out:
        with open(PKG_FILE, "w") as f:
            f.write(out + "\n")

    # Save pip package list (user-installed only, exclude base)
    rc, out = sh("pip freeze --exclude-editable 2>/dev/null")
    if rc == 0 and out:
        with open(PIP_FILE, "w") as f:
            f.write(out + "\n")

    # Rsync each sync dir → /data/
    excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    for d in SYNC_DIRS:
        if not os.path.isdir(d):
            continue
        dst = os.path.join(PERSIST_PATH, d.lstrip("/"))
        os.makedirs(dst, exist_ok=True)
        cmd = f"rsync -rlptD --delete {excludes} '{d}/' '{dst}/'"
        sh(cmd)

    # Clean .cache dirs (HF API rejects .cache paths)
    for dirpath, dirnames, _ in os.walk(PERSIST_PATH):
        for dn in list(dirnames):
            if dn == ".cache":
                shutil.rmtree(os.path.join(dirpath, dn), ignore_errors=True)
                dirnames.remove(dn)

    # Upload each directory to HF dataset
    api = HfApi(token=HF_TOKEN)
    ts = time.strftime("%Y-%m-%d %H:%M", time.gmtime())
    ok = 0
    fail = 0

    # Upload package list files + log (best-effort, small files)
    for pf, name in [(PKG_FILE, "user-packages.list"),
                     (PIP_FILE, "pip-packages.txt"),
                     (LOGFILE, "huggingrun.log")]:
        if os.path.exists(pf):
            try:
                api.upload_file(
                    path_or_fileobj=pf,
                    path_in_repo=name,
                    repo_id=HF_DATASET_REPO,
                    repo_type="dataset",
                    commit_message=f"sync {ts}: {name}",
                )
            except Exception:
                pass

    # Upload each sync dir
    for d in SYNC_DIRS:
        local = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(local):
            continue
        repo_path = d.lstrip("/")
        # Count files
        try:
            fc = int(subprocess.check_output(
                f"find '{local}' -type f | wc -l", shell=True, text=True).strip())
        except Exception:
            fc = 0
        if fc == 0:
            continue
        # Upload with retry (transient HF API failures / rate limits)
        for attempt in range(3):
            t0 = time.time()
            try:
                api.upload_folder(
                    folder_path=local,
                    repo_id=HF_DATASET_REPO,
                    repo_type="dataset",
                    path_in_repo=repo_path,
                    commit_message=f"sync {ts}: {repo_path}/",
                    ignore_patterns=UPLOAD_IGNORE,
                )
                log(f" {repo_path}/ ok ({time.time()-t0:.1f}s, {fc} files)")
                ok += 1
                break
            except Exception as e:
                err = str(e).split('\n')[0][:100]
                if attempt < 2:
                    log(f" {repo_path}/ retry {attempt+1}: {err}")
                    time.sleep((attempt + 1) * 5)
                else:
                    log(f" {repo_path}/ FAILED: {err}")
                    fail += 1
        time.sleep(2)  # rate limit between dirs

    elapsed = time.time() - t0_total
    log(f"── SYNC: done ({elapsed:.1f}s) {ok} ok, {fail} failed")


# ── Background Threads ────────────────────────────────────────────────
restore_done = threading.Event()


def background_restore():
    """Run restore() once; always set restore_done so sync_loop can proceed."""
    try:
        restore()
        ensure_passwords()
    except Exception as e:
        log(f"── RESTORE error: {e}")
    finally:
        restore_done.set()


def sync_loop():
    """Wait for restore, then periodically save_and_upload() forever."""
    restore_done.wait()
    log("sync: ready, first sync in 30s")
    time.sleep(30)
    cycle = 0
    while True:
        cycle += 1
        log(f"── sync #{cycle}")
        try:
            save_and_upload()
        except Exception as e:
            log(f" sync error: {e}")
        time.sleep(SYNC_INTERVAL)


def heartbeat_loop():
    """Log load/memory once a minute so the HF log shows liveness."""
    while True:
        time.sleep(60)
        try:
            _, load = sh("cat /proc/loadavg | cut -d' ' -f1-3")
            _, mem = sh("free -h | grep Mem: | tr -s ' ' | cut -d' ' -f2,3 | sed 's/ /\\//'")
            log(f"heartbeat: load={load} mem={mem}")
        except Exception:
            pass


# ── Log Streamer (SSE on port 7863) ──────────────────────────────────
class LogSSEHandler(http.server.BaseHTTPRequestHandler):
    """Serve LOGFILE as a Server-Sent-Events stream on /stream."""

    def log_message(self, *a):
        # Silence per-request access logging.
        pass

    def do_GET(self):
        if self.path != "/stream":
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        try:
            with open(LOGFILE) as f:
                # Send existing log
                for line in f:
                    line = line.rstrip("\n")
                    if line:
                        ev = json.dumps({"data": line})
                        self.wfile.write(f"data: {ev}\n\n".encode())
                self.wfile.flush()
                # Tail new lines
                while True:
                    line = f.readline()
                    if line:
                        line = line.rstrip("\n")
                        if line:
                            ev = json.dumps({"data": line})
                            self.wfile.write(f"data: {ev}\n\n".encode())
                            self.wfile.flush()
                    else:
                        self.wfile.write(b": keepalive\n\n")
                        self.wfile.flush()
                        time.sleep(1)
        except (BrokenPipeError, ConnectionResetError):
            # Client went away — normal for SSE.
            pass


# ── Service Management ────────────────────────────────────────────────
def ensure_passwords():
    """Create the tao-shen user, set known passwords, wire up sudo + SSH keys."""
    sh("id tao-shen >/dev/null 2>&1 || useradd -m -s /bin/bash tao-shen")
    sh('echo "tao-shen:huggingrun" | chpasswd')
    sh('echo "root:huggingrun" | chpasswd')
    sh("usermod -aG sudo tao-shen 2>/dev/null || true")
    sh("echo 'tao-shen ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/tao-shen")
    sh("chmod 440 /etc/sudoers.d/tao-shen")
    # Fix home dir ownership (restore may bring old uid mapping)
    sh("chown tao-shen:tao-shen /home/tao-shen")
    # Copy root's SSH keys to tao-shen if needed
    if os.path.exists("/root/.ssh/authorized_keys"):
        sh("mkdir -p /home/tao-shen/.ssh")
        sh("cp -n /root/.ssh/authorized_keys /home/tao-shen/.ssh/authorized_keys 2>/dev/null || true")
        sh("chown -R tao-shen:tao-shen /home/tao-shen/.ssh")
        sh("chmod 700 /home/tao-shen/.ssh && chmod 600 /home/tao-shen/.ssh/authorized_keys")
    sh("ldconfig 2>/dev/null || true")


def start_sshd():
    """Start sshd at boot; returns the Popen handle."""
    proc = _spawn_sshd()
    log(f"[OK] sshd PID={proc.pid}" if proc.poll() is None else "[FAIL] sshd")
    return proc


def start_ttyd():
    """Start ttyd web terminal logging in as tao-shen."""
    proc = subprocess.Popen([
        "ttyd", "--port", TTYD_PORT, "--writable", "--base-path", "/",
        "login", "-f", "tao-shen",
    ])
    time.sleep(0.5)
    log(f"[OK] ttyd PID={proc.pid}" if proc.poll() is None else "[FAIL] ttyd")
    return proc


def start_ws_bridge():
    """Start the websocket→SSH bridge sidecar."""
    proc = subprocess.Popen([sys.executable, "/ws_ssh_bridge.py"])
    time.sleep(0.5)
    log(f"[OK] ws-bridge PID={proc.pid}" if proc.poll() is None else "[FAIL] ws-bridge")
    return proc


def start_log_streamer():
    """Serve the SSE log stream on 127.0.0.1:7863 in a daemon thread."""
    srv = http.server.HTTPServer(("127.0.0.1", 7863), LogSSEHandler)
    threading.Thread(target=srv.serve_forever, daemon=True).start()
    log("[OK] log-streamer :7863")


# ── Main ──────────────────────────────────────────────────────────────
def main():
    os.makedirs(PERSIST_PATH, exist_ok=True)
    open(LOGFILE, "a").close()
    sh("hostname huggingrun")
    resolve_config()
    ensure_dataset_repo()
    ensure_passwords()

    # Write env for other processes
    with open("/etc/huggingrun.env", "w") as f:
        f.write(f'export HF_TOKEN="{HF_TOKEN}"\n')
        f.write(f'export HF_DATASET_REPO="{HF_DATASET_REPO}"\n')

    # Start services (open port 7860 ASAP to avoid HF timeout)
    start_sshd()
    start_ws_bridge()
    start_ttyd()
    start_log_streamer()
    nginx_proc = subprocess.Popen(
        ["nginx", "-c", "/etc/nginx/nginx.conf", "-g", "daemon off;"]
    )
    log(f"[OK] nginx PID={nginx_proc.pid}")
    log("=" * 50)
    log("READY — restore runs in background")
    log("=" * 50)

    # Background restore + sync
    threading.Thread(target=background_restore, daemon=True).start()
    threading.Thread(target=sync_loop, daemon=True).start()
    threading.Thread(target=heartbeat_loop, daemon=True).start()

    # Graceful shutdown: one last sync before the Space is torn down
    def on_signal(sig, frame):
        log(f"signal {sig} — final save")
        nginx_proc.terminate()
        try:
            save_and_upload()
        except Exception as e:
            log(f"final save error: {e}")
        sys.exit(0)

    signal.signal(signal.SIGTERM, on_signal)
    signal.signal(signal.SIGINT, on_signal)
    nginx_proc.wait()


if __name__ == "__main__":
    main()