HuggingRun

Sleeping

App Files Files Community

tao-shen committed 12 days ago

Commit

fe5f1bf

1 Parent(s): 2c2cdd4

chore: add remote logs doc + monitor script, restore Dockerfile pip step

Browse files

Files changed (3) hide show

README.md +2 -1
docs/REMOTE_LOGS.md +31 -0
scripts/monitor_and_test.py +169 -0

README.md CHANGED Viewed

@@ -44,7 +44,8 @@ HuggingRun 是面向 Hugging Face Spaces 的**通用部署接口**：用同一
 - **单端口约定**：应用只需监听 `APP_PORT`（默认 7860）；多端口服务需自己在容器内做反向代理。
 - **统一入口**：同一 entrypoint 先做恢复与同步，再 `exec` 你的 `RUN_CMD`，便于任意镜像复用。
-详见 [docs/HF_LIMITATIONS.md](docs/HF_LIMITATIONS.md)。
 ## 示例（最小用法）

 - **单端口约定**：应用只需监听 `APP_PORT`（默认 7860）；多端口服务需自己在容器内做反向代理。
 - **统一入口**：同一 entrypoint 先做恢复与同步，再 `exec` 你的 `RUN_CMD`，便于任意镜像复用。
+详见 [docs/HF_LIMITATIONS.md](docs/HF_LIMITATIONS.md)。
+远端构建/运行日志（本地 debug）：[docs/REMOTE_LOGS.md](docs/REMOTE_LOGS.md)。
 ## 示例（最小用法）

docs/REMOTE_LOGS.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# 远端 Space 日志（本地 debug 用）
+用 HF API 拉取构建/运行日志，便于本地排查问题。需 `HF_TOKEN`（有该 Space 权限）。
+## 容器运行日志（SSE）
+```bash
+curl -N \
+  -H "Authorization: Bearer $HF_TOKEN" \
+  "https://huggingface.co/api/spaces/tao-shen/HuggingRun/logs/run"
+```
+## 构建日志（SSE）
+```bash
+curl -N \
+  -H "Authorization: Bearer $HF_TOKEN" \
+  "https://huggingface.co/api/spaces/tao-shen/HuggingRun/logs/build"
+```
+将 `tao-shen/HuggingRun` 换成你的 `SPACE_ID`（例如 `你的用户名/你的Space名`）。
+## 在脚本里用
+```bash
+# 流式拉取运行日志（Ctrl+C 结束）
+HF_TOKEN=your_token python3 scripts/monitor_and_test.py --logs run
+# 流式拉取构建日志
+HF_TOKEN=your_token python3 scripts/monitor_and_test.py --logs build
+```

scripts/monitor_and_test.py ADDED Viewed

	@@ -0,0 +1,169 @@

+#!/usr/bin/env python3
+"""
+HuggingRun: monitor a remote Space's status and run basic / stress / persistence checks.
+Usage:
+  HF_TOKEN=xxx python3 scripts/monitor_and_test.py [--space-id tao-shen/HuggingRun] [--wait-running] [--test]
+  HF_TOKEN=xxx python3 scripts/monitor_and_test.py --logs run   # stream container run logs (SSE)
+  HF_TOKEN=xxx python3 scripts/monitor_and_test.py --logs build # stream build logs (SSE)
+"""
+import argparse
+import os
+import sys
+import time
+import urllib.request
+import urllib.error
+# Default Space to operate on; overridable via the SPACE_ID environment variable
+# (main() may overwrite it again from --space-id).
+SPACE_ID = os.environ.get("SPACE_ID", "tao-shen/HuggingRun")
+# Base URL from which stream_logs() builds "<base>/<space_id>/logs/<type>" endpoints.
+HF_LOGS_BASE = "https://huggingface.co/api/spaces"
+# HF Space app URL (replace / with - and often lowercase)
+# NOTE: this default is a fixed string, not derived from SPACE_ID; set APP_URL
+# (or pass --url) when targeting a different Space.
+APP_URL = os.environ.get("APP_URL", "https://tao-shen-huggingrun.hf.space")
+def get_runtime():
+    try:
+        from huggingface_hub import HfApi
+        token = os.environ.get("HF_TOKEN")
+        if not token:
+            return None, "HF_TOKEN not set"
+        api = HfApi(token=token)
+        rt = api.get_space_runtime(SPACE_ID)
+        return rt, None
+    except Exception as e:
+        return None, str(e)
+def wait_running(max_wait_sec=600, poll_interval=15):
+    """轮询直到 stage == RUNNING 或超时。"""
+    start = time.time()
+    while (time.time() - start) < max_wait_sec:
+        rt, err = get_runtime()
+        if err:
+            print(f"[monitor] get_runtime error: {err}")
+        elif rt:
+            stage = getattr(rt, "stage", None) or (rt.raw or {}).get("stage")
+            print(f"[monitor] Space {SPACE_ID} stage={stage}")
+            if stage == "RUNNING":
+                return True
+            if stage == "ERROR" or stage == "BUILD_ERROR":
+                print(f"[monitor] Space in error state: {stage}")
+                return False
+        time.sleep(poll_interval)
+    print("[monitor] Timeout waiting for RUNNING")
+    return False
+def http_get(url, timeout=30):
+    try:
+        req = urllib.request.Request(url, method="GET")
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.status, resp.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as e:
+        return e.code, e.read().decode("utf-8", errors="replace") if e.fp else ""
+    except Exception as e:
+        return -1, str(e)
+def test_basic(url, expect_substring="HuggingRun"):
+    status, body = http_get(url)
+    ok = status == 200 and (expect_substring in body or "Run anything" in body)
+    print(f"[test] GET {url} -> {status}, body contains expected: {expect_substring in body or 'Run anything' in body}")
+    return ok
+def test_stress(url, n=50, concurrency=10):
+    """连续请求 n 次（简单串行），检查均返回 200。"""
+    import concurrent.futures
+    failed = 0
+    def one(i):
+        s, _ = http_get(url, timeout=15)
+        return s == 200
+    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as ex:
+        results = list(ex.map(one, range(n)))
+    passed = sum(results)
+    failed = n - passed
+    print(f"[stress] {n} requests: {passed} ok, {failed} failed")
+    return failed == 0
+def test_persistence(url, rounds=3):
+    """多轮访问，检查页面内容中计数或状态会变化/保留（demo 页有 Visit count）。"""
+    counts = []
+    for _ in range(rounds):
+        status, body = http_get(url)
+        if status != 200:
+            return False
+        # Demo 页有 "Visit count (persisted): N"
+        if "Visit count" in body or "total_visits" in body or "persisted" in body:
+            counts.append(1)
+        time.sleep(1)
+    print(f"[persistence] {rounds} rounds, body contained persistence keywords: {len(counts) == rounds}")
+    return len(counts) >= 1  # 至少有一轮包含持久化相关文案即认为可接受
+def stream_logs(space_id: str, log_type: str):
+    """Stream build or run logs (SSE). Requires HF_TOKEN."""
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("HF_TOKEN required for --logs", file=sys.stderr)
+        sys.exit(1)
+    url = f"{HF_LOGS_BASE}/{space_id}/logs/{log_type}"
+    req = urllib.request.Request(url, method="GET")
+    req.add_header("Authorization", f"Bearer {token}")
+    try:
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            while True:
+                chunk = resp.read(4096)
+                if not chunk:
+                    break
+                sys.stdout.buffer.write(chunk)
+                sys.stdout.flush()
+    except Exception as e:
+        print(f"Logs error: {e}", file=sys.stderr)
+        sys.exit(1)
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--space-id", default=SPACE_ID)
+    p.add_argument("--url", default=APP_URL)
+    p.add_argument("--wait-running", action="store_true", help="Poll until Space is RUNNING")
+    p.add_argument("--test", action="store_true", help="Run basic + stress + persistence tests")
+    p.add_argument("--logs", choices=("build", "run"), help="Stream logs: build or run (SSE)")
+    p.add_argument("--stress-n", type=int, default=50)
+    p.add_argument("--max-wait", type=int, default=600)
+    args = p.parse_args()
+    global SPACE_ID, APP_URL
+    SPACE_ID = args.space_id
+    APP_URL = args.url.rstrip("/")
+    if args.logs:
+        stream_logs(SPACE_ID, args.logs)
+        return
+    if args.wait_running:
+        ok = wait_running(max_wait_sec=args.max_wait)
+        if not ok:
+            sys.exit(1)
+    if args.test:
+        print(f"[test] Target: {APP_URL}")
+        if not test_basic(APP_URL):
+            print("[test] BASIC FAILED")
+            sys.exit(1)
+        if not test_stress(APP_URL, n=args.stress_n):
+            print("[test] STRESS FAILED")
+            sys.exit(1)
+        if not test_persistence(APP_URL):
+            print("[test] PERSISTENCE CHECK (keyword) FAILED")
+            sys.exit(1)
+        print("[test] ALL PASSED")
+    else:
+        rt, err = get_runtime()
+        if err:
+            print("Runtime:", err)
+        else:
+            print("Runtime:", getattr(rt, "stage", rt.raw))
+# Run the CLI only when executed as a script, not when imported as a module.
+if __name__ == "__main__":
+    main()