Spaces:
Running
Running
perf(proxy): gate /debug/tasks stack-depth computation behind query param
Browse files
Task.get_stack(limit=32) walks coroutine frames synchronously and
measurably stalls the event loop when called for 50+ relay tasks during
a reconnect storm. The /debug/tasks snapshot does not need this by
default — the cheap fields (name, coro_qualname, age_seconds, done) are
enough for the common case of "which tasks are alive".
- collect_tasks: add a keyword-only with_stack_depth=False parameter; the
default leaves stack_depth=None in every entry.
- /debug/tasks: read the ?stack=true query parameter and forward it. The default
response is cheap; opt in explicitly when a human is debugging a single snapshot.
- Test: verify default entries have stack_depth=None and that
?stack=true produces at least one integer depth.
headroom/proxy/debug_introspection.py
CHANGED
|
@@ -113,6 +113,8 @@ def _age_for_named_task(
|
|
| 113 |
|
| 114 |
def collect_tasks(
|
| 115 |
ws_registry: WebSocketSessionRegistry | None = None,
|
|
|
|
|
|
|
| 116 |
) -> list[dict[str, Any]]:
|
| 117 |
"""Enumerate ``asyncio.all_tasks()`` for /debug/tasks.
|
| 118 |
|
|
@@ -121,6 +123,12 @@ def collect_tasks(
|
|
| 121 |
and ``done``. Sorted by age descending with ``None`` ages sorted
|
| 122 |
after known ages. System noise (``None`` tasks, tasks with no
|
| 123 |
coroutine) is filtered out.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"""
|
| 125 |
try:
|
| 126 |
tasks = asyncio.all_tasks()
|
|
@@ -145,7 +153,7 @@ def collect_tasks(
|
|
| 145 |
"name": name,
|
| 146 |
"coro_qualname": qualname,
|
| 147 |
"age_seconds": age,
|
| 148 |
-
"stack_depth": _stack_depth(task),
|
| 149 |
"done": bool(task.done()),
|
| 150 |
}
|
| 151 |
entries.append(entry)
|
|
|
|
| 113 |
|
| 114 |
def collect_tasks(
|
| 115 |
ws_registry: WebSocketSessionRegistry | None = None,
|
| 116 |
+
*,
|
| 117 |
+
with_stack_depth: bool = False,
|
| 118 |
) -> list[dict[str, Any]]:
|
| 119 |
"""Enumerate ``asyncio.all_tasks()`` for /debug/tasks.
|
| 120 |
|
|
|
|
| 123 |
and ``done``. Sorted by age descending with ``None`` ages sorted
|
| 124 |
after known ages. System noise (``None`` tasks, tasks with no
|
| 125 |
coroutine) is filtered out.
|
| 126 |
+
|
| 127 |
+
``stack_depth`` is only computed when ``with_stack_depth=True``
|
| 128 |
+
because :meth:`asyncio.Task.get_stack` walks coroutine frames and
|
| 129 |
+
can noticeably stall the event loop during a storm with 50+ relay
|
| 130 |
+
tasks. The default returns ``stack_depth=None``; callers that need
|
| 131 |
+
it (a human debugging one snapshot) can pass ``with_stack_depth=True``.
|
| 132 |
"""
|
| 133 |
try:
|
| 134 |
tasks = asyncio.all_tasks()
|
|
|
|
| 153 |
"name": name,
|
| 154 |
"coro_qualname": qualname,
|
| 155 |
"age_seconds": age,
|
| 156 |
+
"stack_depth": _stack_depth(task) if with_stack_depth else None,
|
| 157 |
"done": bool(task.done()),
|
| 158 |
}
|
| 159 |
entries.append(entry)
|
headroom/proxy/server.py
CHANGED
|
@@ -1335,9 +1335,19 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
|
|
| 1335 |
from headroom.proxy.loopback_guard import require_loopback as _require_loopback
|
| 1336 |
|
| 1337 |
@app.get("/debug/tasks", dependencies=[Depends(_require_loopback)])
|
| 1338 |
-
async def debug_tasks():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1339 |
ws_registry = getattr(proxy, "ws_sessions", None)
|
| 1340 |
-
return JSONResponse(
|
|
|
|
|
|
|
|
|
|
| 1341 |
|
| 1342 |
@app.get("/debug/ws-sessions", dependencies=[Depends(_require_loopback)])
|
| 1343 |
async def debug_ws_sessions():
|
|
|
|
| 1335 |
from headroom.proxy.loopback_guard import require_loopback as _require_loopback
|
| 1336 |
|
| 1337 |
@app.get("/debug/tasks", dependencies=[Depends(_require_loopback)])
|
| 1338 |
+
async def debug_tasks(stack: bool = False):
|
| 1339 |
+
"""Enumerate running asyncio tasks.
|
| 1340 |
+
|
| 1341 |
+
Default is cheap — ``stack_depth`` is ``null`` in every entry so
|
| 1342 |
+
a storm snapshot does not walk 50+ coroutine frames synchronously.
|
| 1343 |
+
Pass ``?stack=true`` to compute ``stack_depth`` for each task
|
| 1344 |
+
(useful for single-shot human debugging).
|
| 1345 |
+
"""
|
| 1346 |
ws_registry = getattr(proxy, "ws_sessions", None)
|
| 1347 |
+
return JSONResponse(
|
| 1348 |
+
status_code=200,
|
| 1349 |
+
content=_collect_tasks(ws_registry, with_stack_depth=stack),
|
| 1350 |
+
)
|
| 1351 |
|
| 1352 |
@app.get("/debug/ws-sessions", dependencies=[Depends(_require_loopback)])
|
| 1353 |
async def debug_ws_sessions():
|
tests/test_proxy_debug_endpoints.py
CHANGED
|
@@ -256,6 +256,37 @@ def test_debug_tasks_returns_json_array_for_loopback(client):
|
|
| 256 |
assert "coro_qualname" in entry
|
| 257 |
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
def test_debug_warmup_reports_registry_slots(client):
|
| 260 |
response = client.get("/debug/warmup")
|
| 261 |
assert response.status_code == 200
|
|
|
|
| 256 |
assert "coro_qualname" in entry
|
| 257 |
|
| 258 |
|
| 259 |
+
def test_debug_tasks_stack_depth_is_gated_behind_query(client):
|
| 260 |
+
"""Default response must not compute stack_depth (P3 Fix 29 perf gate).
|
| 261 |
+
|
| 262 |
+
``?stack=true`` opts into the synchronous ``Task.get_stack`` walk; the
|
| 263 |
+
default stays cheap so snapshotting during a reconnect storm does
|
| 264 |
+
not stall the event loop.
|
| 265 |
+
"""
|
| 266 |
+
default = client.get("/debug/tasks")
|
| 267 |
+
assert default.status_code == 200
|
| 268 |
+
for entry in default.json():
|
| 269 |
+
assert entry["stack_depth"] is None, (
|
| 270 |
+
f"default /debug/tasks must not compute stack_depth; "
|
| 271 |
+
f"got {entry['stack_depth']!r} for {entry.get('name')!r}"
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
with_stack = client.get("/debug/tasks?stack=true")
|
| 275 |
+
assert with_stack.status_code == 200
|
| 276 |
+
entries = with_stack.json()
|
| 277 |
+
# At least one entry should have a computed depth (the TestClient
|
| 278 |
+
# itself runs under a task). Some entries may still be None if
|
| 279 |
+
# get_stack raised defensively — we only require that opting in
|
| 280 |
+
# produces at least one integer result.
|
| 281 |
+
integer_depths = [
|
| 282 |
+
e["stack_depth"] for e in entries if isinstance(e["stack_depth"], int)
|
| 283 |
+
]
|
| 284 |
+
assert integer_depths, (
|
| 285 |
+
"expected at least one int stack_depth when ?stack=true; "
|
| 286 |
+
f"got entries={entries!r}"
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
def test_debug_warmup_reports_registry_slots(client):
|
| 291 |
response = client.get("/debug/warmup")
|
| 292 |
assert response.status_code == 200
|