Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
HuggingRun v2 — Ubuntu Server on HuggingFace Spaces.
Persistence via HF Dataset (direct file sync, no archives):
- Sync scope: /home, /root, /usr/local, /opt, /var/lib, /var/log, /etc
- System packages: saved as package name list, restored via apt install
- Symlinks/permissions safe: no system binary dirs synced
"""
| import http.server | |
| import json | |
| import os | |
| import shutil | |
| import signal | |
| import subprocess | |
| import sys | |
| import threading | |
| import time | |
| from datetime import datetime, timezone | |
# ── Config ────────────────────────────────────────────────────────────
# Local mirror of the HF dataset: restore reads from here, sync writes here.
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
# HF credentials / target dataset; may be resolved later by resolve_config().
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
# Seconds between periodic save_and_upload() cycles.
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
# Ports are kept as strings: they are only interpolated into command lines.
SSH_PORT = os.environ.get("SSH_PORT", "2222")
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
LOGFILE = "/var/log/huggingrun.log"
# Package list baked into the image at build time (baseline to diff against).
BASE_PKG_FILE = "/etc/base-packages.list"
# Package lists saved into the persisted dataset on every sync.
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
PIP_FILE = os.path.join(PERSIST_PATH, "pip-packages.txt")
# Directories to sync (user data only, no system binaries)
SYNC_DIRS = ["/home", "/root", "/usr/local", "/opt", "/var/lib", "/var/log", "/etc"]
# Exclude from rsync (relative paths — rsync matches from transfer root)
RSYNC_EXCLUDES = ["*.sock", "*.pid", "*.lock"]
# Extra excludes only for /etc/ restore (container-managed files)
ETC_RESTORE_EXCLUDES = [
    "hostname", "hosts", "resolv.conf", "mtab", "fstab",
    "alternatives",  # symlinks managed by dpkg
    "ld.so.cache",  # rebuilt by ldconfig
]
# Exclude from HF upload
UPLOAD_IGNORE = [
    "__pycache__", "*.pyc", ".git", ".git*",
    "*.sock", "*.lock", ".huggingface", ".cache",
    "huggingrun.env",  # contains HF_TOKEN
]
# ── Logging ───────────────────────────────────────────────────────────
def log(msg):
    """Write a timestamped line to stderr and append it to LOGFILE (best effort)."""
    entry = f"[{time.strftime('%H:%M:%S', time.gmtime())}] {msg}"
    print(entry, file=sys.stderr, flush=True)
    try:
        with open(LOGFILE, "a") as fh:
            fh.write(entry + "\n")
    except Exception:
        # LOGFILE may not be writable yet (or at all); stderr already has the line.
        pass
def sh(cmd, log_err=False):
    """Execute *cmd* through the shell; return (exit_code, stripped stdout)."""
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    # Only surface stderr when asked and the command actually failed.
    if log_err and result.returncode != 0 and result.stderr:
        log(f" cmd error: {result.stderr.strip()[:200]}")
    stdout_text = (result.stdout or "").strip()
    return result.returncode, stdout_text
# ── Config Resolution ─────────────────────────────────────────────────
def resolve_config():
    """Resolve HF_TOKEN / HF_DATASET_REPO from the environment and log a banner.

    Repo name fallbacks: explicit env var → "<SPACE_ID>-data" → "<user>/HuggingRun-data"
    derived from the token via whoami().
    """
    global HF_TOKEN, HF_DATASET_REPO
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_DATASET_REPO:
        space = os.environ.get("SPACE_ID", "")
        if space:
            HF_DATASET_REPO = f"{space}-data"
        elif HF_TOKEN:
            try:
                from huggingface_hub import HfApi
                owner = HfApi(token=HF_TOKEN).whoami()["name"]
                HF_DATASET_REPO = f"{owner}/HuggingRun-data"
            except Exception:
                pass
    # Export for child processes (ws bridge, shells sourcing huggingrun.env).
    os.environ["HF_DATASET_REPO"] = HF_DATASET_REPO
    for entry in (
        "=" * 50,
        "HuggingRun v2",
        f" HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}",
        f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}",
        f" SYNC_DIRS: {SYNC_DIRS}",
        f" SYNC_INTERVAL: {SYNC_INTERVAL}s",
        "=" * 50,
    ):
        log(entry)
def ensure_dataset_repo():
    """Make sure the private HF dataset repo exists; create it if missing."""
    if not (HF_TOKEN and HF_DATASET_REPO):
        return
    from huggingface_hub import HfApi
    client = HfApi(token=HF_TOKEN)
    try:
        client.repo_info(repo_id=HF_DATASET_REPO, repo_type="dataset")
        log(f"dataset: {HF_DATASET_REPO}")
    except Exception:
        # repo_info raises when the repo does not exist (or is unreachable).
        client.create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", private=True)
        log(f"created dataset: {HF_DATASET_REPO}")
# ── Restore ───────────────────────────────────────────────────────────
def _download_snapshot():
    """Pull the full dataset snapshot into PERSIST_PATH; return True on success."""
    log("ββ RESTORE: downloading dataset")
    os.makedirs(PERSIST_PATH, exist_ok=True)
    t0 = time.time()
    try:
        from huggingface_hub import snapshot_download
        snapshot_download(
            repo_id=HF_DATASET_REPO, repo_type="dataset",
            local_dir=PERSIST_PATH, token=HF_TOKEN,
        )
        log(f" downloaded ({time.time()-t0:.1f}s)")
        return True
    except Exception as e:
        log(f" download failed: {e}")
        return False

def _rsync_back():
    """rsync each saved directory from PERSIST_PATH onto / (no --delete)."""
    log("ββ RESTORE: rsync /data/ β /")
    base_excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    etc_excludes = " ".join(f"--exclude='{e}'" for e in ETC_RESTORE_EXCLUDES)
    for d in SYNC_DIRS:
        src = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(src):
            continue
        # /etc additionally skips container-managed files (hosts, resolv.conf, ...).
        excludes = base_excludes if d != "/etc" else f"{base_excludes} {etc_excludes}"
        rc, _ = sh(f"rsync -rlptD {excludes} '{src}/' '{d}/'", log_err=True)
        if rc == 0:
            log(f" {d}/ restored")
        else:
            log(f" {d}/ restore failed (rc={rc})")

def _reinstall_apt():
    """apt-install packages present in the saved list but not in the base image."""
    if not (os.path.exists(PKG_FILE) and os.path.exists(BASE_PKG_FILE)):
        log("ββ RESTORE: no package list")
        return
    with open(BASE_PKG_FILE) as f:
        base = set(f.read().split())
    with open(PKG_FILE) as f:
        saved = set(f.read().split())
    to_install = sorted(saved - base)
    if not to_install:
        log("ββ RESTORE: no extra packages")
        return
    log(f"ββ RESTORE: apt install {len(to_install)} packages")
    rc, _ = sh(f"apt-get update -qq && apt-get install -y --reinstall --no-install-recommends {' '.join(to_install)}")
    if rc == 0:
        log(" packages restored")
    else:
        log(" some packages failed")

def _reinstall_pip():
    """pip-install packages from the saved freeze file, if it has any entries."""
    if not os.path.exists(PIP_FILE):
        return
    with open(PIP_FILE) as f:
        pip_pkgs = [l.strip() for l in f if l.strip() and not l.startswith('#')]
    if not pip_pkgs:
        return
    log(f"ββ RESTORE: pip install {len(pip_pkgs)} packages")
    rc, _ = sh(f"pip install --break-system-packages -q -r '{PIP_FILE}'")
    if rc == 0:
        log(" pip packages restored")
    else:
        log(" some pip packages failed")

def _restart_sshd():
    """Fix restored host-key permissions and relaunch sshd to pick them up."""
    # HF datasets do not preserve file modes; sshd refuses world-readable keys.
    sh("chmod 600 /etc/ssh/ssh_host_*_key 2>/dev/null")
    sh("chmod 644 /etc/ssh/ssh_host_*_key.pub 2>/dev/null")
    sh("pkill sshd")
    time.sleep(1)
    # Reuse the boot-time launcher (same flags) instead of duplicating the
    # Popen invocation; it logs its own [OK]/[FAIL] status.
    start_sshd()

def restore():
    """Download dataset β rsync to / β reinstall apt packages.

    No-op when HF credentials or the dataset repo are not configured, when the
    download fails, or when the dataset is empty (fresh start).
    """
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    if not _download_snapshot():
        return
    # Anything to restore at all?
    has_data = any(
        os.path.isdir(os.path.join(PERSIST_PATH, d.lstrip("/")))
        for d in SYNC_DIRS
    )
    if not has_data:
        log(" empty dataset, fresh start")
        return
    _rsync_back()
    _reinstall_apt()
    _reinstall_pip()
    _restart_sshd()
    # Rebuild the dynamic linker cache after restored libs land in /usr/local etc.
    sh('ldconfig 2>/dev/null')
    log("ββ RESTORE: complete")
# ── Save + Upload ─────────────────────────────────────────────────────
def _save_package_lists():
    """Snapshot installed apt and pip package names for later reinstall."""
    rc, out = sh("dpkg-query -W -f='${Package}\\n'")
    if rc == 0 and out:
        with open(PKG_FILE, "w") as f:
            f.write(out + "\n")
    # NOTE(review): despite the original "exclude base" comment, this freezes
    # everything pip sees, not just user-installed packages.
    rc, out = sh("pip freeze --exclude-editable 2>/dev/null")
    if rc == 0 and out:
        with open(PIP_FILE, "w") as f:
            f.write(out + "\n")

def _mirror_sync_dirs():
    """rsync --delete each SYNC_DIR into its mirror under PERSIST_PATH."""
    excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    for d in SYNC_DIRS:
        if not os.path.isdir(d):
            continue
        dst = os.path.join(PERSIST_PATH, d.lstrip("/"))
        os.makedirs(dst, exist_ok=True)
        sh(f"rsync -rlptD --delete {excludes} '{d}/' '{dst}/'")

def _prune_cache_dirs():
    """Remove .cache dirs from the mirror (HF API rejects .cache paths)."""
    for dirpath, dirnames, _ in os.walk(PERSIST_PATH):
        for dn in list(dirnames):
            if dn == ".cache":
                shutil.rmtree(os.path.join(dirpath, dn), ignore_errors=True)
                dirnames.remove(dn)

def _upload_metadata(api, ts):
    """Upload package lists + log file; failures are logged, never fatal."""
    for pf, name in [(PKG_FILE, "user-packages.list"), (PIP_FILE, "pip-packages.txt"), (LOGFILE, "huggingrun.log")]:
        if not os.path.exists(pf):
            continue
        try:
            api.upload_file(
                path_or_fileobj=pf, path_in_repo=name,
                repo_id=HF_DATASET_REPO, repo_type="dataset",
                commit_message=f"sync {ts}: {name}",
            )
        except Exception as e:
            # Best-effort, but leave a trace instead of swallowing silently.
            log(f" {name} upload failed: {str(e).split(chr(10))[0][:100]}")

def _upload_sync_dirs(api, ts):
    """Upload each non-empty mirrored dir (3 attempts); return (ok, fail) counts."""
    ok = 0
    fail = 0
    for d in SYNC_DIRS:
        local = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(local):
            continue
        repo_path = d.lstrip("/")
        # Skip dirs with nothing to upload.
        try:
            fc = int(subprocess.check_output(
                f"find '{local}' -type f | wc -l", shell=True, text=True).strip())
        except Exception:
            fc = 0
        if fc == 0:
            continue
        for attempt in range(3):
            t0 = time.time()
            try:
                api.upload_folder(
                    folder_path=local,
                    repo_id=HF_DATASET_REPO, repo_type="dataset",
                    path_in_repo=repo_path,
                    commit_message=f"sync {ts}: {repo_path}/",
                    ignore_patterns=UPLOAD_IGNORE,
                )
                log(f" {repo_path}/ ok ({time.time()-t0:.1f}s, {fc} files)")
                ok += 1
                break
            except Exception as e:
                err = str(e).split('\n')[0][:100]
                if attempt < 2:
                    log(f" {repo_path}/ retry {attempt+1}: {err}")
                    time.sleep((attempt + 1) * 5)
                else:
                    log(f" {repo_path}/ FAILED: {err}")
                    fail += 1
        time.sleep(2)  # rate limit between dirs
    return ok, fail

def save_and_upload():
    """Rsync sync dirs to /data/, then upload to HF dataset.

    No-op when HF credentials or the dataset repo are not configured.
    """
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    from huggingface_hub import HfApi
    log("ββ SYNC: start")
    t0_total = time.time()
    _save_package_lists()
    _mirror_sync_dirs()
    _prune_cache_dirs()
    api = HfApi(token=HF_TOKEN)
    ts = time.strftime("%Y-%m-%d %H:%M", time.gmtime())
    _upload_metadata(api, ts)
    ok, fail = _upload_sync_dirs(api, ts)
    elapsed = time.time() - t0_total
    log(f"ββ SYNC: done ({elapsed:.1f}s) {ok} ok, {fail} failed")
# ── Background Threads ────────────────────────────────────────────────
# Set once the one-shot restore pass has finished (success or failure);
# sync_loop waits on it before its first upload.
restore_done = threading.Event()

def background_restore():
    """Run restore + password setup once; always signal completion."""
    try:
        restore()
        ensure_passwords()
    except Exception as exc:
        log(f"ββ RESTORE error: {exc}")
    finally:
        restore_done.set()
def sync_loop():
    """Block until restore finishes, then run save_and_upload forever.

    First sync is delayed 30s; subsequent cycles are SYNC_INTERVAL apart.
    """
    restore_done.wait()
    log("sync: ready, first sync in 30s")
    time.sleep(30)
    cycle = 0
    while True:
        cycle = cycle + 1
        log(f"ββ sync #{cycle}")
        try:
            save_and_upload()
        except Exception as exc:
            # Keep the loop alive no matter what a single cycle does.
            log(f" sync error: {exc}")
        time.sleep(SYNC_INTERVAL)
def heartbeat_loop():
    """Once a minute, log load average and memory usage (best effort)."""
    while True:
        time.sleep(60)
        try:
            _, load = sh("cat /proc/loadavg | cut -d' ' -f1-3")
            _, mem = sh("free -h | grep Mem: | tr -s ' ' | cut -d' ' -f2,3 | sed 's/ /\\//'")
            log(f"heartbeat: load={load} mem={mem}")
        except Exception:
            # Heartbeat is purely informational; never let it die.
            pass
# ── Log Streamer (SSE on port 7863) ──────────────────────────────────
class LogSSEHandler(http.server.BaseHTTPRequestHandler):
    """Serves LOGFILE as a Server-Sent Events stream at GET /stream."""

    def log_message(self, *a):
        # Silence the default per-request stderr logging.
        pass

    def _send_event(self, text):
        """Emit one SSE data frame carrying a single log line."""
        payload = json.dumps({"data": text})
        self.wfile.write(f"data: {payload}\n\n".encode())
        self.wfile.flush()

    def do_GET(self):
        if self.path != "/stream":
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        try:
            with open(LOGFILE) as fh:
                # Replay the existing backlog first...
                for raw in fh:
                    raw = raw.rstrip("\n")
                    if raw:
                        self._send_event(raw)
                # ...then tail the file; when idle, send a comment keepalive
                # once a second so proxies don't drop the connection.
                while True:
                    raw = fh.readline()
                    if raw:
                        raw = raw.rstrip("\n")
                        if raw:
                            self._send_event(raw)
                    else:
                        self.wfile.write(b": keepalive\n\n")
                        self.wfile.flush()
                        time.sleep(1)
        except (BrokenPipeError, ConnectionResetError):
            # Client went away; just end the handler.
            pass
# ── Service Management ────────────────────────────────────────────────
def ensure_passwords():
    """Create the tao-shen account with known passwords, sudo, and SSH keys."""
    sh("id tao-shen >/dev/null 2>&1 || useradd -m -s /bin/bash tao-shen")
    for cmd in (
        'echo "tao-shen:huggingrun" | chpasswd',
        'echo "root:huggingrun" | chpasswd',
        "usermod -aG sudo tao-shen 2>/dev/null || true",
        "echo 'tao-shen ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/tao-shen",
        "chmod 440 /etc/sudoers.d/tao-shen",
        # Restore may bring files back under a stale uid mapping.
        "chown tao-shen:tao-shen /home/tao-shen",
    ):
        sh(cmd)
    # Mirror root's authorized_keys to the user account (cp -n: don't clobber).
    if os.path.exists("/root/.ssh/authorized_keys"):
        for cmd in (
            "mkdir -p /home/tao-shen/.ssh",
            "cp -n /root/.ssh/authorized_keys /home/tao-shen/.ssh/authorized_keys 2>/dev/null || true",
            "chown -R tao-shen:tao-shen /home/tao-shen/.ssh",
            "chmod 700 /home/tao-shen/.ssh && chmod 600 /home/tao-shen/.ssh/authorized_keys",
        ):
            sh(cmd)
    sh("ldconfig 2>/dev/null || true")
def start_sshd():
    """Launch sshd in the foreground on 127.0.0.1:SSH_PORT; return the process."""
    os.makedirs("/run/sshd", exist_ok=True)
    argv = ["/usr/sbin/sshd", "-D", "-e"]
    for option in (
        f"Port={SSH_PORT}",
        "ListenAddress=127.0.0.1",
        "PermitRootLogin=yes",
        "PasswordAuthentication=yes",
        "PermitEmptyPasswords=no",
        "UsePAM=yes",
    ):
        argv += ["-o", option]
    proc = subprocess.Popen(argv)
    # Give it a moment to fail fast on bad config before reporting status.
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] sshd PID={proc.pid}")
    else:
        log("[FAIL] sshd")
    return proc
def start_ttyd():
    """Launch ttyd serving a tao-shen login shell on TTYD_PORT; return the process."""
    argv = [
        "ttyd", "--port", TTYD_PORT, "--writable", "--base-path", "/",
        "login", "-f", "tao-shen",
    ]
    proc = subprocess.Popen(argv)
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] ttyd PID={proc.pid}")
    else:
        log("[FAIL] ttyd")
    return proc
def start_ws_bridge():
    """Launch the websocket→SSH bridge sidecar; return the process."""
    proc = subprocess.Popen([sys.executable, "/ws_ssh_bridge.py"])
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] ws-bridge PID={proc.pid}")
    else:
        log("[FAIL] ws-bridge")
    return proc
def start_log_streamer():
    """Serve the SSE log stream on 127.0.0.1:7863 from a daemon thread."""
    server = http.server.HTTPServer(("127.0.0.1", 7863), LogSSEHandler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    log("[OK] log-streamer :7863")
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Boot sequence: config → services (fast) → background restore/sync → wait on nginx."""
    os.makedirs(PERSIST_PATH, exist_ok=True)
    # Ensure the log file exists before the SSE streamer tries to open it.
    open(LOGFILE, "a").close()
    sh("hostname huggingrun")
    resolve_config()
    ensure_dataset_repo()
    ensure_passwords()
    # Write env for other processes
    # NOTE(review): HF_TOKEN is written in plain text; UPLOAD_IGNORE excludes
    # "huggingrun.env" so it should not reach the dataset — verify the /etc
    # mirror under /data is actually filtered by that pattern on upload.
    with open("/etc/huggingrun.env", "w") as f:
        f.write(f'export HF_TOKEN="{HF_TOKEN}"\n')
        f.write(f'export HF_DATASET_REPO="{HF_DATASET_REPO}"\n')
    # Start services (open port 7860 ASAP to avoid HF timeout)
    start_sshd()
    start_ws_bridge()
    start_ttyd()
    start_log_streamer()
    # nginx fronts everything; it runs in the foreground of this process tree.
    nginx_proc = subprocess.Popen(
        ["nginx", "-c", "/etc/nginx/nginx.conf", "-g", "daemon off;"]
    )
    log(f"[OK] nginx PID={nginx_proc.pid}")
    log("=" * 50)
    log("READY β restore runs in background")
    log("=" * 50)
    # Background restore + sync
    threading.Thread(target=background_restore, daemon=True).start()
    threading.Thread(target=sync_loop, daemon=True).start()
    threading.Thread(target=heartbeat_loop, daemon=True).start()
    # Graceful shutdown
    def on_signal(sig, frame):
        # Flush state to the dataset before the container is torn down.
        log(f"signal {sig} β final save")
        nginx_proc.terminate()
        try:
            save_and_upload()
        except Exception as e:
            log(f"final save error: {e}")
        sys.exit(0)
    signal.signal(signal.SIGTERM, on_signal)
    signal.signal(signal.SIGINT, on_signal)
    # Block on nginx; its exit (or a signal above) ends the container.
    nginx_proc.wait()
if __name__ == "__main__":
    main()