# HuggingRun / entrypoint.py
# provenance (scraped page header): tao-shen — "fix: chown /home/tao-shen after
# restore (old uid mapping)", commit 444d40a
#!/usr/bin/env python3
"""
HuggingRun v2 — Ubuntu Server on HuggingFace Spaces.
Persistence via HF Dataset (direct file sync, no archives):
- Sync scope: /home, /root, /usr/local, /opt, /var/lib, /var/log, /etc
- System packages: saved as package name list, restored via apt install
- Symlinks/permissions safe: no system binary dirs synced
"""
import http.server
import json
import os
import shutil
import signal
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone
# ── Config ────────────────────────────────────────────────────────────
# All settings can be overridden via environment variables.
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")  # local mirror of the HF dataset
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))  # seconds between sync cycles
SSH_PORT = os.environ.get("SSH_PORT", "2222")
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
LOGFILE = "/var/log/huggingrun.log"
BASE_PKG_FILE = "/etc/base-packages.list"  # apt packages baked into the image (written at build time)
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")  # full dpkg list captured at sync time
PIP_FILE = os.path.join(PERSIST_PATH, "pip-packages.txt")  # `pip freeze` output captured at sync time
# Directories to sync (user data only, no system binaries)
SYNC_DIRS = ["/home", "/root", "/usr/local", "/opt", "/var/lib", "/var/log", "/etc"]
# Exclude from rsync (relative paths β€” rsync matches from transfer root)
RSYNC_EXCLUDES = ["*.sock", "*.pid", "*.lock"]
# Extra excludes only for /etc/ restore (container-managed files)
ETC_RESTORE_EXCLUDES = [
    "hostname", "hosts", "resolv.conf", "mtab", "fstab",
    "alternatives",  # symlinks managed by dpkg
    "ld.so.cache",  # rebuilt by ldconfig
]
# Exclude from HF upload
UPLOAD_IGNORE = [
    "__pycache__", "*.pyc", ".git", ".git*",
    "*.sock", "*.lock", ".huggingface", ".cache",
    "huggingrun.env",  # contains HF_TOKEN
]
# ── Logging ───────────────────────────────────────────────────────────
def log(msg):
    """Emit *msg* to stderr and append it to LOGFILE, prefixed with a UTC timestamp.

    The file write is best-effort: any failure (missing dir, permissions)
    is silently ignored so logging can never crash the entrypoint.
    """
    stamp = time.strftime("%H:%M:%S", time.gmtime())
    entry = f"[{stamp}] {msg}"
    print(entry, file=sys.stderr, flush=True)
    try:
        with open(LOGFILE, "a") as logfile:
            logfile.write(entry + "\n")
    except Exception:
        pass
def sh(cmd, log_err=False):
    """Run shell command, return (exit_code, stdout)."""
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    stderr_text = result.stderr
    # Optionally surface a truncated stderr snippet for failed commands.
    if log_err and result.returncode != 0 and stderr_text:
        log(f" cmd error: {stderr_text.strip()[:200]}")
    stdout_text = (result.stdout or "").strip()
    return result.returncode, stdout_text
# ── Config Resolution ─────────────────────────────────────────────────
def resolve_config():
    """Resolve HF_TOKEN / HF_DATASET_REPO from the environment and log the result.

    Repo fallback order: explicit HF_DATASET_REPO env var →
    "<SPACE_ID>-data" → "<username>/HuggingRun-data" derived from the
    token via whoami(). The resolved repo is exported back into
    os.environ so child processes see it.
    """
    global HF_TOKEN, HF_DATASET_REPO
    # Re-read the token in case the environment changed after import time.
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_DATASET_REPO:
        space_id = os.environ.get("SPACE_ID", "")
        if space_id:
            HF_DATASET_REPO = f"{space_id}-data"
        elif HF_TOKEN:
            try:
                from huggingface_hub import HfApi
                name = HfApi(token=HF_TOKEN).whoami()["name"]
                HF_DATASET_REPO = f"{name}/HuggingRun-data"
            except Exception:
                # Best effort — without a repo, persistence simply stays off.
                pass
    os.environ["HF_DATASET_REPO"] = HF_DATASET_REPO
    log("=" * 50)
    log("HuggingRun v2")
    log(f" HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}")
    log(f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}")
    log(f" SYNC_DIRS: {SYNC_DIRS}")
    log(f" SYNC_INTERVAL: {SYNC_INTERVAL}s")
    log("=" * 50)
def ensure_dataset_repo():
    """Create the private persistence dataset on HF if it does not exist yet.

    No-op when credentials or the repo name are missing.
    """
    if not (HF_TOKEN and HF_DATASET_REPO):
        return
    from huggingface_hub import HfApi
    api = HfApi(token=HF_TOKEN)
    try:
        api.repo_info(repo_id=HF_DATASET_REPO, repo_type="dataset")
    except Exception:
        # repo_info raises when the repo is absent — create it privately.
        api.create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", private=True)
        log(f"created dataset: {HF_DATASET_REPO}")
    else:
        log(f"dataset: {HF_DATASET_REPO}")
# ── Restore ───────────────────────────────────────────────────────────
def restore():
    """Restore persisted state from the HF dataset.

    Pipeline (each step best-effort): download snapshot → rsync each
    SYNC_DIR back into place → reinstall apt and pip packages → fix SSH
    key permissions and restart sshd. No-op when HF credentials or the
    dataset repo are not configured.
    """
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    if not _download_snapshot():
        return
    # A brand-new dataset contains none of the sync dirs — nothing to do.
    has_data = any(
        os.path.isdir(os.path.join(PERSIST_PATH, d.lstrip("/")))
        for d in SYNC_DIRS
    )
    if not has_data:
        log(" empty dataset, fresh start")
        return
    _rsync_restore()
    _reinstall_apt()
    _reinstall_pip()
    _restart_sshd_with_restored_keys()
    # Rebuild the dynamic linker cache for libraries restored under /usr/local.
    sh('ldconfig 2>/dev/null')
    log("── RESTORE: complete")


def _download_snapshot():
    """Download the HF dataset snapshot into PERSIST_PATH; True on success."""
    log("── RESTORE: downloading dataset")
    os.makedirs(PERSIST_PATH, exist_ok=True)
    t0 = time.time()
    try:
        from huggingface_hub import snapshot_download
        snapshot_download(
            repo_id=HF_DATASET_REPO, repo_type="dataset",
            local_dir=PERSIST_PATH, token=HF_TOKEN,
        )
        log(f" downloaded ({time.time()-t0:.1f}s)")
        return True
    except Exception as e:
        log(f" download failed: {e}")
        return False


def _rsync_restore():
    """Rsync each persisted dir from PERSIST_PATH back to / .

    No --delete: existing container files are kept, restored files win.
    """
    log("── RESTORE: rsync /data/ β†’ /")
    base_excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    etc_excludes = " ".join(f"--exclude='{e}'" for e in ETC_RESTORE_EXCLUDES)
    for d in SYNC_DIRS:
        src = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(src):
            continue
        excludes = base_excludes
        if d == "/etc":
            # /etc needs extra excludes for container-managed files.
            excludes = f"{base_excludes} {etc_excludes}"
        cmd = f"rsync -rlptD {excludes} '{src}/' '{d}/'"
        rc, _ = sh(cmd, log_err=True)
        if rc == 0:
            log(f" {d}/ restored")
        else:
            log(f" {d}/ restore failed (rc={rc})")


def _reinstall_apt():
    """Reinstall apt packages saved in PKG_FILE that are not in the base image."""
    if os.path.exists(PKG_FILE) and os.path.exists(BASE_PKG_FILE):
        with open(BASE_PKG_FILE) as f:
            base = set(f.read().split())
        with open(PKG_FILE) as f:
            saved = set(f.read().split())
        # Only install the user-added delta; the base set ships with the image.
        to_install = sorted(saved - base)
        if to_install:
            log(f"── RESTORE: apt install {len(to_install)} packages")
            rc, _ = sh(f"apt-get update -qq && apt-get install -y --reinstall --no-install-recommends {' '.join(to_install)}")
            if rc == 0:
                log(" packages restored")
            else:
                log(" some packages failed")
        else:
            log("── RESTORE: no extra packages")
    else:
        log("── RESTORE: no package list")


def _reinstall_pip():
    """Reinstall pip packages from the saved requirements file, if present."""
    if not os.path.exists(PIP_FILE):
        return
    with open(PIP_FILE) as f:
        pip_pkgs = [line.strip() for line in f
                    if line.strip() and not line.startswith('#')]
    if pip_pkgs:
        log(f"── RESTORE: pip install {len(pip_pkgs)} packages")
        rc, _ = sh(f"pip install --break-system-packages -q -r '{PIP_FILE}'")
        if rc == 0:
            log(" pip packages restored")
        else:
            log(" some pip packages failed")


def _restart_sshd_with_restored_keys():
    """Fix restored host-key permissions and restart sshd to pick them up."""
    # HF dataset uploads don't preserve permissions; sshd rejects lax key modes.
    sh("chmod 600 /etc/ssh/ssh_host_*_key 2>/dev/null")
    sh("chmod 644 /etc/ssh/ssh_host_*_key.pub 2>/dev/null")
    sh("pkill sshd")
    time.sleep(1)
    os.makedirs("/run/sshd", exist_ok=True)
    proc = subprocess.Popen([
        "/usr/sbin/sshd", "-D", "-e",
        "-o", f"Port={SSH_PORT}",
        "-o", "ListenAddress=127.0.0.1",
        "-o", "PermitRootLogin=yes",
        "-o", "PasswordAuthentication=yes",
        "-o", "PermitEmptyPasswords=no",
        "-o", "UsePAM=yes",
    ])
    time.sleep(0.5)
    if proc.poll() is None:
        log(f" sshd restarted ok PID={proc.pid}")
    else:
        log(f" sshd restart FAILED (exit={proc.poll()})")
# ── Save + Upload ─────────────────────────────────────────────────────
def save_and_upload():
    """Rsync sync dirs to /data/, then upload to HF dataset.

    Flow: dump apt/pip package lists → mirror each SYNC_DIR into
    PERSIST_PATH (with --delete, so the mirror tracks removals) → prune
    .cache dirs → upload list files + log, then each dir (3 attempts,
    linear backoff). No-op without credentials.
    """
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    from huggingface_hub import HfApi
    log("── SYNC: start")
    t0_total = time.time()
    # Save apt package list (full dpkg state; diffed against the base at restore)
    rc, out = sh("dpkg-query -W -f='${Package}\\n'")
    if rc == 0 and out:
        with open(PKG_FILE, "w") as f:
            f.write(out + "\n")
    # Save pip package list (user-installed only, exclude base)
    rc, out = sh("pip freeze --exclude-editable 2>/dev/null")
    if rc == 0 and out:
        with open(PIP_FILE, "w") as f:
            f.write(out + "\n")
    # Rsync each sync dir β†’ /data/
    excludes = " ".join(f"--exclude='{e}'" for e in RSYNC_EXCLUDES)
    for d in SYNC_DIRS:
        if not os.path.isdir(d):
            continue
        dst = os.path.join(PERSIST_PATH, d.lstrip("/"))
        os.makedirs(dst, exist_ok=True)
        cmd = f"rsync -rlptD --delete {excludes} '{d}/' '{dst}/'"
        sh(cmd)
    # Clean .cache dirs (HF API rejects .cache paths)
    for dirpath, dirnames, _ in os.walk(PERSIST_PATH):
        for dn in list(dirnames):
            if dn == ".cache":
                shutil.rmtree(os.path.join(dirpath, dn), ignore_errors=True)
                # Don't descend into the tree we just removed.
                dirnames.remove(dn)
    # Upload each directory to HF dataset
    api = HfApi(token=HF_TOKEN)
    ts = time.strftime("%Y-%m-%d %H:%M", time.gmtime())
    ok = 0
    fail = 0
    # Upload package list files + log (best effort — failures are ignored)
    for pf, name in [(PKG_FILE, "user-packages.list"), (PIP_FILE, "pip-packages.txt"), (LOGFILE, "huggingrun.log")]:
        if os.path.exists(pf):
            try:
                api.upload_file(
                    path_or_fileobj=pf, path_in_repo=name,
                    repo_id=HF_DATASET_REPO, repo_type="dataset",
                    commit_message=f"sync {ts}: {name}",
                )
            except Exception:
                pass
    # Upload each sync dir
    for d in SYNC_DIRS:
        local = os.path.join(PERSIST_PATH, d.lstrip("/"))
        if not os.path.isdir(local):
            continue
        repo_path = d.lstrip("/")
        # Count files — empty dirs are skipped rather than committed.
        try:
            fc = int(subprocess.check_output(
                f"find '{local}' -type f | wc -l", shell=True, text=True).strip())
        except Exception:
            fc = 0
        if fc == 0:
            continue
        # Upload with retry (backoff: 5s after attempt 0, 10s after attempt 1)
        for attempt in range(3):
            t0 = time.time()
            try:
                api.upload_folder(
                    folder_path=local,
                    repo_id=HF_DATASET_REPO, repo_type="dataset",
                    path_in_repo=repo_path,
                    commit_message=f"sync {ts}: {repo_path}/",
                    ignore_patterns=UPLOAD_IGNORE,
                )
                log(f" {repo_path}/ ok ({time.time()-t0:.1f}s, {fc} files)")
                ok += 1
                break
            except Exception as e:
                err = str(e).split('\n')[0][:100]
                if attempt < 2:
                    log(f" {repo_path}/ retry {attempt+1}: {err}")
                    time.sleep((attempt + 1) * 5)
                else:
                    log(f" {repo_path}/ FAILED: {err}")
                    fail += 1
        time.sleep(2)  # rate limit between dirs
    elapsed = time.time() - t0_total
    log(f"── SYNC: done ({elapsed:.1f}s) {ok} ok, {fail} failed")
# ── Background Threads ────────────────────────────────────────────────
# Set once the background restore finishes (success or failure);
# sync_loop waits on it before the first upload cycle.
restore_done = threading.Event()
def background_restore():
    """Run restore + user setup in the background; always release restore_done."""
    try:
        restore()
        ensure_passwords()
    except Exception as exc:
        log(f"── RESTORE error: {exc}")
    finally:
        # Unblock sync_loop no matter how the restore ended.
        restore_done.set()
def sync_loop():
    """After the restore completes, run save_and_upload every SYNC_INTERVAL seconds."""
    restore_done.wait()
    log("sync: ready, first sync in 30s")
    time.sleep(30)

    def run_once(n):
        # One sync cycle; errors are logged and never break the loop.
        log(f"── sync #{n}")
        try:
            save_and_upload()
        except Exception as err:
            log(f" sync error: {err}")

    cycle = 0
    while True:
        cycle += 1
        run_once(cycle)
        time.sleep(SYNC_INTERVAL)
def heartbeat_loop():
    """Log load average and memory usage once a minute (best effort)."""
    while True:
        time.sleep(60)
        try:
            _, load_avg = sh("cat /proc/loadavg | cut -d' ' -f1-3")
            _, mem_usage = sh("free -h | grep Mem: | tr -s ' ' | cut -d' ' -f2,3 | sed 's/ /\\//'")
            log(f"heartbeat: load={load_avg} mem={mem_usage}")
        except Exception:
            # Heartbeat must never kill its thread.
            pass
# ── Log Streamer (SSE on port 7863) ──────────────────────────────────
class LogSSEHandler(http.server.BaseHTTPRequestHandler):
    """Serves LOGFILE as a Server-Sent-Events stream at GET /stream."""

    def log_message(self, *a):
        # Silence the default per-request stderr logging.
        pass

    def do_GET(self):
        """Replay the existing log, then tail it forever (1s poll + keepalives)."""
        if self.path != "/stream":
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        try:
            with open(LOGFILE) as f:
                # Send existing log
                for line in f:
                    line = line.rstrip("\n")
                    if line:
                        ev = json.dumps({"data": line})
                        self.wfile.write(f"data: {ev}\n\n".encode())
                        self.wfile.flush()
                # Tail new lines
                while True:
                    line = f.readline()
                    if line:
                        line = line.rstrip("\n")
                        if line:
                            ev = json.dumps({"data": line})
                            self.wfile.write(f"data: {ev}\n\n".encode())
                            self.wfile.flush()
                    else:
                        # SSE comment line keeps clients/proxies from timing out.
                        self.wfile.write(b": keepalive\n\n")
                        self.wfile.flush()
                        time.sleep(1)
        except (BrokenPipeError, ConnectionResetError):
            # Client disconnected — normal termination of the stream.
            pass
# ── Service Management ────────────────────────────────────────────────
def ensure_passwords():
    """(Re)create the tao-shen account, set known passwords, grant sudo, fix perms."""
    setup_cmds = (
        "id tao-shen >/dev/null 2>&1 || useradd -m -s /bin/bash tao-shen",
        'echo "tao-shen:huggingrun" | chpasswd',
        'echo "root:huggingrun" | chpasswd',
        "usermod -aG sudo tao-shen 2>/dev/null || true",
        "echo 'tao-shen ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/tao-shen",
        "chmod 440 /etc/sudoers.d/tao-shen",
        # Restore may bring back a home dir owned by a stale uid mapping.
        "chown tao-shen:tao-shen /home/tao-shen",
    )
    for cmd in setup_cmds:
        sh(cmd)
    # Copy root's SSH keys to tao-shen if needed (cp -n: never overwrite).
    if os.path.exists("/root/.ssh/authorized_keys"):
        for cmd in (
            "mkdir -p /home/tao-shen/.ssh",
            "cp -n /root/.ssh/authorized_keys /home/tao-shen/.ssh/authorized_keys 2>/dev/null || true",
            "chown -R tao-shen:tao-shen /home/tao-shen/.ssh",
            "chmod 700 /home/tao-shen/.ssh && chmod 600 /home/tao-shen/.ssh/authorized_keys",
        ):
            sh(cmd)
    sh("ldconfig 2>/dev/null || true")
def start_sshd():
    """Launch sshd in the foreground on localhost; return the Popen handle."""
    os.makedirs("/run/sshd", exist_ok=True)
    sshd_options = [
        f"Port={SSH_PORT}",
        "ListenAddress=127.0.0.1",
        "PermitRootLogin=yes",
        "PasswordAuthentication=yes",
        "PermitEmptyPasswords=no",
        "UsePAM=yes",
    ]
    argv = ["/usr/sbin/sshd", "-D", "-e"]
    for opt in sshd_options:
        argv.extend(["-o", opt])
    proc = subprocess.Popen(argv)
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] sshd PID={proc.pid}")
    else:
        log("[FAIL] sshd")
    return proc
def start_ttyd():
    """Start the ttyd web terminal (auto-login as tao-shen); return the Popen."""
    argv = [
        "ttyd", "--port", TTYD_PORT, "--writable", "--base-path", "/",
        "login", "-f", "tao-shen",
    ]
    proc = subprocess.Popen(argv)
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] ttyd PID={proc.pid}")
    else:
        log("[FAIL] ttyd")
    return proc
def start_ws_bridge():
    """Start the WebSocket-to-SSH bridge helper process; return the Popen."""
    proc = subprocess.Popen([sys.executable, "/ws_ssh_bridge.py"])
    time.sleep(0.5)
    if proc.poll() is None:
        log(f"[OK] ws-bridge PID={proc.pid}")
    else:
        log("[FAIL] ws-bridge")
    return proc
def start_log_streamer():
    """Serve the SSE log stream on 127.0.0.1:7863 from a daemon thread."""
    server = http.server.HTTPServer(("127.0.0.1", 7863), LogSSEHandler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    log("[OK] log-streamer :7863")
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Boot sequence: config → services (port 7860 via nginx) → background restore/sync.

    nginx runs in the foreground as the Space's main process; SIGTERM/SIGINT
    trigger a best-effort final sync before exit.
    """
    os.makedirs(PERSIST_PATH, exist_ok=True)
    # Ensure the log file exists before the SSE streamer tries to open it.
    open(LOGFILE, "a").close()
    sh("hostname huggingrun")
    resolve_config()
    ensure_dataset_repo()
    ensure_passwords()
    # Write env for other processes
    with open("/etc/huggingrun.env", "w") as f:
        f.write(f'export HF_TOKEN="{HF_TOKEN}"\n')
        f.write(f'export HF_DATASET_REPO="{HF_DATASET_REPO}"\n')
    # Start services (open port 7860 ASAP to avoid HF timeout)
    start_sshd()
    start_ws_bridge()
    start_ttyd()
    start_log_streamer()
    nginx_proc = subprocess.Popen(
        ["nginx", "-c", "/etc/nginx/nginx.conf", "-g", "daemon off;"]
    )
    log(f"[OK] nginx PID={nginx_proc.pid}")
    log("=" * 50)
    log("READY β€” restore runs in background")
    log("=" * 50)
    # Background restore + sync
    threading.Thread(target=background_restore, daemon=True).start()
    threading.Thread(target=sync_loop, daemon=True).start()
    threading.Thread(target=heartbeat_loop, daemon=True).start()
    # Graceful shutdown
    def on_signal(sig, frame):
        # Best-effort final sync before the container is torn down.
        log(f"signal {sig} — final save")
        nginx_proc.terminate()
        try:
            save_and_upload()
        except Exception as e:
            log(f"final save error: {e}")
        sys.exit(0)
    signal.signal(signal.SIGTERM, on_signal)
    signal.signal(signal.SIGINT, on_signal)
    # Block on nginx — when it exits, the entrypoint (and Space) exits.
    nginx_proc.wait()
# Script entry point.
if __name__ == "__main__":
    main()