tao-shen committed on
Commit
fe5f1bf
·
1 Parent(s): 2c2cdd4

chore: add remote logs doc + monitor script, restore Dockerfile pip step

Browse files
Files changed (3) hide show
  1. README.md +2 -1
  2. docs/REMOTE_LOGS.md +31 -0
  3. scripts/monitor_and_test.py +169 -0
README.md CHANGED
@@ -44,7 +44,8 @@ HuggingRun 是面向 Hugging Face Spaces 的**通用部署接口**:用同一
44
  - **单端口约定**:应用只需监听 `APP_PORT`(默认 7860);多端口服务需自己在容器内做反向代理。
45
  - **统一入口**:同一 entrypoint 先做恢复与同步,再 `exec` 你的 `RUN_CMD`,便于任意镜像复用。
46
 
47
- 详见 [docs/HF_LIMITATIONS.md](docs/HF_LIMITATIONS.md)。
 
48
 
49
  ## 示例(最小用法)
50
 
 
44
  - **单端口约定**:应用只需监听 `APP_PORT`(默认 7860);多端口服务需自己在容器内做反向代理。
45
  - **统一入口**:同一 entrypoint 先做恢复与同步,再 `exec` 你的 `RUN_CMD`,便于任意镜像复用。
46
 
47
+ 详见 [docs/HF_LIMITATIONS.md](docs/HF_LIMITATIONS.md)。
48
+ 远端构建/运行日志(本地 debug):[docs/REMOTE_LOGS.md](docs/REMOTE_LOGS.md)。
49
 
50
  ## 示例(最小用法)
51
 
docs/REMOTE_LOGS.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 远端 Space 日志(本地 debug 用)
2
+
3
+ 用 HF API 拉取构建/运行日志,便于本地排查问题。需 `HF_TOKEN`(有该 Space 权限)。
4
+
5
+ ## 容器运行日志(SSE)
6
+
7
+ ```bash
8
+ curl -N \
9
+ -H "Authorization: Bearer $HF_TOKEN" \
10
+ "https://huggingface.co/api/spaces/tao-shen/HuggingRun/logs/run"
11
+ ```
12
+
13
+ ## 构建日志(SSE)
14
+
15
+ ```bash
16
+ curl -N \
17
+ -H "Authorization: Bearer $HF_TOKEN" \
18
+ "https://huggingface.co/api/spaces/tao-shen/HuggingRun/logs/build"
19
+ ```
20
+
21
+ 将 `tao-shen/HuggingRun` 换成你的 `SPACE_ID`(例如 `你的用户名/你的Space名`)。
22
+
23
+ ## 在脚本里用
24
+
25
+ ```bash
26
+ # 流式拉取运行日志(Ctrl+C 结束)
27
+ HF_TOKEN=your_token python3 scripts/monitor_and_test.py --logs run
28
+
29
+ # 拉取构建日志
30
+ HF_TOKEN=your_token python3 scripts/monitor_and_test.py --logs build
31
+ ```
scripts/monitor_and_test.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HuggingRun: 监控远端 Space 状态并执行基础/压力/持久化验证。
4
+ 用法:
5
+ HF_TOKEN=xxx python3 scripts/monitor_and_test.py [--space-id tao-shen/HuggingRun] [--wait-running] [--test]
6
+ HF_TOKEN=xxx python3 scripts/monitor_and_test.py --logs run # 流式拉取容器运行日志 (SSE)
7
+ HF_TOKEN=xxx python3 scripts/monitor_and_test.py --logs build # 流式拉取构建日志 (SSE)
8
+ """
9
+ import argparse
10
+ import os
11
+ import sys
12
+ import time
13
+ import urllib.request
14
+ import urllib.error
15
+
16
def default_app_url(space_id):
    """Return the default public app URL for the Space *space_id*.

    The host name is built by replacing ``/`` with ``-`` and lowercasing
    the id, e.g. ``tao-shen/HuggingRun`` ->
    ``https://tao-shen-huggingrun.hf.space``.

    NOTE(review): ids containing other special characters may map to a
    different subdomain -- set ``APP_URL`` (or ``--url``) explicitly then.
    """
    subdomain = space_id.replace("/", "-").lower()
    return f"https://{subdomain}.hf.space"


# Space to monitor, as "owner/name"; overridable through the environment.
SPACE_ID = os.environ.get("SPACE_ID", "tao-shen/HuggingRun")
# Base URL of the Hub API used for the SSE log endpoints.
HF_LOGS_BASE = "https://huggingface.co/api/spaces"
# App URL under test. Derived from SPACE_ID unless APP_URL is set, so that
# overriding SPACE_ID alone no longer leaves the tests pointed at the
# default Space.
APP_URL = os.environ.get("APP_URL", default_app_url(SPACE_ID))
20
+
21
+
22
def get_runtime():
    """Fetch the runtime description of the Space named by ``SPACE_ID``.

    Returns:
        A ``(runtime, error)`` pair. On success ``runtime`` is the object
        returned by ``HfApi.get_space_runtime`` and ``error`` is ``None``;
        on failure ``runtime`` is ``None`` and ``error`` is a message.
    """
    # Check the token before touching the optional dependency so that a
    # missing token is always reported as such, never as an import error.
    token = os.environ.get("HF_TOKEN")
    if not token:
        return None, "HF_TOKEN not set"
    try:
        # Imported lazily: only this code path needs huggingface_hub.
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        return api.get_space_runtime(SPACE_ID), None
    except Exception as e:
        # Best-effort probe: any failure (missing package, network error,
        # auth error) is handed back to the caller as text.
        return None, str(e)
33
+
34
+
35
def wait_running(max_wait_sec=600, poll_interval=15):
    """Poll the Space until its stage is ``RUNNING``, it fails, or time runs out.

    Args:
        max_wait_sec: Maximum total time to wait, in seconds.
        poll_interval: Delay between two polls, in seconds.

    Returns:
        ``True`` once the stage is ``RUNNING``; ``False`` if the Space
        reports an error stage or the wait times out.
    """
    # Stages from which the Space will not reach RUNNING on its own.
    # "ERROR" is kept for backward compatibility with the original check.
    failure_stages = (
        "ERROR",
        "BUILD_ERROR",
        "RUNTIME_ERROR",
        "CONFIG_ERROR",
        "NO_APP_FILE",
    )
    # Monotonic clock: immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + max_wait_sec
    while time.monotonic() < deadline:
        rt, err = get_runtime()
        if err:
            print(f"[monitor] get_runtime error: {err}")
        elif rt:
            raw = getattr(rt, "raw", None) or {}
            stage = getattr(rt, "stage", None) or raw.get("stage")
            print(f"[monitor] Space {SPACE_ID} stage={stage}")
            if stage == "RUNNING":
                return True
            if stage in failure_stages:
                print(f"[monitor] Space in error state: {stage}")
                return False
        # Never sleep past the deadline.
        time.sleep(max(0, min(poll_interval, deadline - time.monotonic())))
    print("[monitor] Timeout waiting for RUNNING")
    return False
53
+
54
+
55
def http_get(url, timeout=30):
    """Issue a GET request and return ``(status, body_text)``.

    Args:
        url: URL to fetch.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        ``(status_code, body)`` for any HTTP response, including error
        statuses; ``(-1, message)`` when no HTTP response was obtained
        (invalid URL, connection failure, timeout, ...). The body is decoded
        as UTF-8 with undecodable bytes replaced.
    """
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.status, resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if not e.fp:
            return e.code, ""
        try:
            return e.code, e.read().decode("utf-8", errors="replace")
        finally:
            # HTTPError wraps the response; close it to release the socket.
            e.close()
    except Exception as e:
        return -1, str(e)
64
+
65
+
66
def test_basic(url, expect_substring="HuggingRun"):
    """Check that *url* answers 200 with a recognisable page.

    The page is accepted when its body contains *expect_substring* or the
    fixed marker ``"Run anything"``.

    Returns:
        ``True`` when the status is 200 and a marker was found.
    """
    status, body = http_get(url)
    # Evaluate the marker check once; it feeds both the log line and the result.
    has_marker = expect_substring in body or "Run anything" in body
    print(f"[test] GET {url} -> {status}, body contains expected: {has_marker}")
    return status == 200 and has_marker
71
+
72
+
73
def test_stress(url, n=50, concurrency=10):
    """Send *n* GET requests through a thread pool and require all to return 200.

    Args:
        url: URL to request.
        n: Total number of requests.
        concurrency: Maximum number of worker threads (clamped to at least 1).

    Returns:
        ``True`` when no request failed.
    """
    import concurrent.futures

    def one(_):
        status, _body = http_get(url, timeout=15)
        return status == 200

    # ThreadPoolExecutor rejects max_workers <= 0, hence the clamp.
    workers = max(1, concurrency)
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
        results = list(ex.map(one, range(n)))
    passed = sum(results)
    failed = n - passed
    print(f"[stress] {n} requests: {passed} ok, {failed} failed")
    return failed == 0
86
+
87
+
88
def test_persistence(url, rounds=3):
    """Fetch *url* several times and look for persistence-related text.

    A round counts as a hit when the body contains ``"Visit count"``,
    ``"total_visits"`` or ``"persisted"`` (the demo page shows
    "Visit count (persisted): N").

    Returns:
        ``False`` as soon as any request is not a 200; otherwise ``True``
        when at least one round was a hit. The printed line additionally
        reports whether every round was a hit.
    """
    keywords = ("Visit count", "total_visits", "persisted")
    hits = 0
    for round_no in range(rounds):
        if round_no:
            # Pause between rounds only; no need to wait after the last one.
            time.sleep(1)
        status, body = http_get(url)
        if status != 200:
            return False
        if any(word in body for word in keywords):
            hits += 1
    print(f"[persistence] {rounds} rounds, body contained persistence keywords: {hits == rounds}")
    # Deliberately lenient: one round showing the keywords is acceptable.
    return hits >= 1
101
+
102
+
103
def stream_logs(space_id: str, log_type: str):
    """Stream a Space's build or run logs (SSE) to stdout until the stream ends.

    Args:
        space_id: Space identifier, ``owner/name``.
        log_type: Path segment selecting the log; the CLI passes
            ``"build"`` or ``"run"``.

    Exits the process with status 1 when ``HF_TOKEN`` is not set or the
    request fails. Returns normally on end of stream or Ctrl+C.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("HF_TOKEN required for --logs", file=sys.stderr)
        sys.exit(1)
    url = f"{HF_LOGS_BASE}/{space_id}/logs/{log_type}"
    req = urllib.request.Request(url, method="GET")
    req.add_header("Authorization", f"Bearer {token}")
    # The timeout applies to every socket read, not just the connect. A log
    # stream can legitimately be silent for a while, so allow long idle gaps
    # instead of aborting after a few quiet seconds.
    idle_timeout = 300
    try:
        with urllib.request.urlopen(req, timeout=idle_timeout) as resp:
            # Iterate line by line so each event is shown as it arrives;
            # read(4096) would wait for a full buffer before returning.
            for line in resp:
                sys.stdout.buffer.write(line)
                sys.stdout.flush()
    except KeyboardInterrupt:
        # Ctrl+C is the documented way to stop streaming; exit quietly.
        return
    except Exception as e:
        print(f"Logs error: {e}", file=sys.stderr)
        sys.exit(1)
123
+
124
+
125
def main():
    """Command-line entry point: stream logs, wait for RUNNING, and/or run tests.

    Behaviour by flag:
      --logs build|run  stream the chosen log and return;
      --wait-running    poll until the Space is RUNNING (exit 1 on failure);
      --test            run the basic, stress and persistence checks
                        (exit 1 on the first failure);
      without --test    print the current runtime stage, or the error.
    """
    # Must precede any use of these names in this function: referencing a
    # name before its `global` declaration is a SyntaxError in Python 3.6+.
    global SPACE_ID, APP_URL

    p = argparse.ArgumentParser()
    p.add_argument("--space-id", default=SPACE_ID)
    p.add_argument("--url", default=APP_URL)
    p.add_argument("--wait-running", action="store_true", help="Poll until Space is RUNNING")
    p.add_argument("--test", action="store_true", help="Run basic + stress + persistence tests")
    p.add_argument("--logs", choices=("build", "run"), help="Stream logs: build or run (SSE)")
    p.add_argument("--stress-n", type=int, default=50)
    p.add_argument("--max-wait", type=int, default=600)
    args = p.parse_args()

    # The helper functions read these module globals, so publish the CLI values.
    SPACE_ID = args.space_id
    APP_URL = args.url.rstrip("/")

    if args.logs:
        stream_logs(SPACE_ID, args.logs)
        return

    if args.wait_running:
        ok = wait_running(max_wait_sec=args.max_wait)
        if not ok:
            sys.exit(1)

    if args.test:
        print(f"[test] Target: {APP_URL}")
        if not test_basic(APP_URL):
            print("[test] BASIC FAILED")
            sys.exit(1)
        if not test_stress(APP_URL, n=args.stress_n):
            print("[test] STRESS FAILED")
            sys.exit(1)
        if not test_persistence(APP_URL):
            print("[test] PERSISTENCE CHECK (keyword) FAILED")
            sys.exit(1)
        print("[test] ALL PASSED")
    else:
        rt, err = get_runtime()
        if err:
            print("Runtime:", err)
        else:
            print("Runtime:", getattr(rt, "stage", rt.raw))


if __name__ == "__main__":
    main()