adryanev committed on
Commit
2aea52a
·
1 Parent(s): fc8b6ec

perf(proxy): gate /debug/tasks stack-depth computation behind query param

Browse files

Task.get_stack(limit=32) walks coroutine frames synchronously and
measurably stalls the event loop when called for 50+ relay tasks during
a reconnect storm. The /debug/tasks snapshot does not need this by
default — the cheap-to-compute fields (name, coro_qualname, age_seconds,
done) are enough for the common case of "which tasks are alive".

- collect_tasks: add with_stack_depth=False kwarg; default leaves
stack_depth=None per entry.
- /debug/tasks: read the ?stack=true query parameter and forward it. The
default response is cheap; opt in explicitly when a human is debugging a
single snapshot.
- Test: verify default entries have stack_depth=None and that
?stack=true produces at least one integer depth.

headroom/proxy/debug_introspection.py CHANGED
@@ -113,6 +113,8 @@ def _age_for_named_task(
113
 
114
  def collect_tasks(
115
  ws_registry: WebSocketSessionRegistry | None = None,
 
 
116
  ) -> list[dict[str, Any]]:
117
  """Enumerate ``asyncio.all_tasks()`` for /debug/tasks.
118
 
@@ -121,6 +123,12 @@ def collect_tasks(
121
  and ``done``. Sorted by age descending with ``None`` ages sorted
122
  after known ages. System noise (``None`` tasks, tasks with no
123
  coroutine) is filtered out.
 
 
 
 
 
 
124
  """
125
  try:
126
  tasks = asyncio.all_tasks()
@@ -145,7 +153,7 @@ def collect_tasks(
145
  "name": name,
146
  "coro_qualname": qualname,
147
  "age_seconds": age,
148
- "stack_depth": _stack_depth(task),
149
  "done": bool(task.done()),
150
  }
151
  entries.append(entry)
 
113
 
114
  def collect_tasks(
115
  ws_registry: WebSocketSessionRegistry | None = None,
116
+ *,
117
+ with_stack_depth: bool = False,
118
  ) -> list[dict[str, Any]]:
119
  """Enumerate ``asyncio.all_tasks()`` for /debug/tasks.
120
 
 
123
  and ``done``. Sorted by age descending with ``None`` ages sorted
124
  after known ages. System noise (``None`` tasks, tasks with no
125
  coroutine) is filtered out.
126
+
127
+ ``stack_depth`` is only computed when ``with_stack_depth=True``
128
+ because :meth:`asyncio.Task.get_stack` walks coroutine frames and
129
+ can noticeably stall the event loop during a storm with 50+ relay
130
+ tasks. The default returns ``stack_depth=None``; callers that need
131
+ it (a human debugging one snapshot) can pass ``with_stack_depth=True``.
132
  """
133
  try:
134
  tasks = asyncio.all_tasks()
 
153
  "name": name,
154
  "coro_qualname": qualname,
155
  "age_seconds": age,
156
+ "stack_depth": _stack_depth(task) if with_stack_depth else None,
157
  "done": bool(task.done()),
158
  }
159
  entries.append(entry)
headroom/proxy/server.py CHANGED
@@ -1335,9 +1335,19 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
1335
  from headroom.proxy.loopback_guard import require_loopback as _require_loopback
1336
 
1337
  @app.get("/debug/tasks", dependencies=[Depends(_require_loopback)])
1338
- async def debug_tasks():
 
 
 
 
 
 
 
1339
  ws_registry = getattr(proxy, "ws_sessions", None)
1340
- return JSONResponse(status_code=200, content=_collect_tasks(ws_registry))
 
 
 
1341
 
1342
  @app.get("/debug/ws-sessions", dependencies=[Depends(_require_loopback)])
1343
  async def debug_ws_sessions():
 
1335
  from headroom.proxy.loopback_guard import require_loopback as _require_loopback
1336
 
1337
  @app.get("/debug/tasks", dependencies=[Depends(_require_loopback)])
1338
+ async def debug_tasks(stack: bool = False):
1339
+ """Enumerate running asyncio tasks.
1340
+
1341
+ Default is cheap — ``stack_depth`` is ``null`` in every entry so
1342
+ a storm snapshot does not walk 50+ coroutine frames synchronously.
1343
+ Pass ``?stack=true`` to compute ``stack_depth`` for each task
1344
+ (useful for single-shot human debugging).
1345
+ """
1346
  ws_registry = getattr(proxy, "ws_sessions", None)
1347
+ return JSONResponse(
1348
+ status_code=200,
1349
+ content=_collect_tasks(ws_registry, with_stack_depth=stack),
1350
+ )
1351
 
1352
  @app.get("/debug/ws-sessions", dependencies=[Depends(_require_loopback)])
1353
  async def debug_ws_sessions():
tests/test_proxy_debug_endpoints.py CHANGED
@@ -256,6 +256,37 @@ def test_debug_tasks_returns_json_array_for_loopback(client):
256
  assert "coro_qualname" in entry
257
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  def test_debug_warmup_reports_registry_slots(client):
260
  response = client.get("/debug/warmup")
261
  assert response.status_code == 200
 
256
  assert "coro_qualname" in entry
257
 
258
 
259
+ def test_debug_tasks_stack_depth_is_gated_behind_query(client):
260
+ """Default response must not compute stack_depth (P3 Fix 29 perf gate).
261
+
262
+ ``?stack=true`` opts into the synchronous ``Task.get_stack`` walk; the
263
+ default stays cheap so snapshotting during a reconnect storm does
264
+ not stall the event loop.
265
+ """
266
+ default = client.get("/debug/tasks")
267
+ assert default.status_code == 200
268
+ for entry in default.json():
269
+ assert entry["stack_depth"] is None, (
270
+ f"default /debug/tasks must not compute stack_depth; "
271
+ f"got {entry['stack_depth']!r} for {entry.get('name')!r}"
272
+ )
273
+
274
+ with_stack = client.get("/debug/tasks?stack=true")
275
+ assert with_stack.status_code == 200
276
+ entries = with_stack.json()
277
+ # At least one entry should have a computed depth (the TestClient
278
+ # itself runs under a task). Some entries may still be None if
279
+ # get_stack raised defensively — we only require that opting in
280
+ # produces at least one integer result.
281
+ integer_depths = [
282
+ e["stack_depth"] for e in entries if isinstance(e["stack_depth"], int)
283
+ ]
284
+ assert integer_depths, (
285
+ "expected at least one int stack_depth when ?stack=true; "
286
+ f"got entries={entries!r}"
287
+ )
288
+
289
+
290
  def test_debug_warmup_reports_registry_slots(client):
291
  response = client.get("/debug/warmup")
292
  assert response.status_code == 200