loosecanvas / scripts /text_graph_latency_probe.py
Joshua Sundance Bailey
loosecanvas: local AI thought-mapping canvas with a trust-tagged knowledge graph
6d1438c
Raw
History Blame Contribute Delete
4.67 kB
"""Throwaway latency probe for the M04b text->graph call path.
Measures where time goes in a schema-constrained extraction call against the live
llama.cpp server, isolating: (a) constrained vs unconstrained generation, (b) the
effect of ``max_tokens``, and (c) chunk size / call-count scaling. Prints the
server's own ``timings`` (prompt/predicted tokens-per-second) so we tune from data,
not guesswork. Not a unit test; safe to delete.
"""
from __future__ import annotations
import asyncio
import time
from typing import Any
import httpx
from loosecanvas.extractors.text_graph_adapter import (
_SYSTEM_PROMPT,
TEXT_GRAPH_RESPONSE_FORMAT,
_chunk_text,
)
# Root URL of the live llama.cpp server that every probe request targets.
BASE: str = "http://127.0.0.1:8080"

# One short paragraph of sample prose used as the baseline input.
SHORT: str = (
    "Photosynthesis is the process by which plants convert sunlight into chemical "
    "energy. Chlorophyll absorbs light, and the Calvin cycle fixes carbon dioxide "
    "into glucose. Oxygen is released as a byproduct."
)

# 25 space-terminated copies of SHORT (5075 characters) to exercise
# multi-chunk behaviour.
LONG: str = "".join(f"{SHORT} " for _ in range(25))
async def _raw_call(
    text: str, *, max_tokens: int, constrained: bool, temperature: float = 1.0
) -> dict[str, Any]:
    """POST one chat-completion request to the server and time the round trip.

    Args:
        text: Content of the user message, sent after the system prompt.
        max_tokens: Forwarded as the request's ``max_tokens``.
        constrained: When true, ``TEXT_GRAPH_RESPONSE_FORMAT`` is attached as
            the request's ``response_format``; otherwise that key is omitted.
        temperature: Forwarded as the request's ``temperature``.

    Returns:
        ``{"wall": seconds, "data": parsed_json}`` where ``wall`` is the
        ``time.perf_counter`` delta measured around the awaited POST.

    Raises:
        httpx.HTTPStatusError: If the response carries an error status.
    """
    conversation = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    # ``response_format`` is spliced in last, and only for constrained runs,
    # so the key order matches a plain "build then conditionally add" dict.
    payload: dict[str, Any] = {
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 0.95,
        "top_k": 64,
        "stop": ["<turn|>"],
        "chat_template_kwargs": {"enable_thinking": False},
        "timings_per_token": True,
        **({"response_format": TEXT_GRAPH_RESPONSE_FORMAT} if constrained else {}),
    }
    endpoint = f"{BASE}/v1/chat/completions"

    async with httpx.AsyncClient(timeout=300.0) as session:
        started = time.perf_counter()
        response = await session.post(endpoint, json=payload)
        elapsed = time.perf_counter() - started
        # Status check happens after the clock stops so it never skews ``wall``.
        response.raise_for_status()
        return {"wall": elapsed, "data": response.json()}
def _summarize(label: str, result: dict[str, Any]) -> None:
    """Print a short report for one probe result.

    Args:
        label: Heading printed above the report.
        result: Mapping with a ``"wall"`` duration in seconds and a ``"data"``
            response body. The body must have ``choices[0].message.content``;
            ``usage`` and ``timings`` are optional.

    Raises:
        KeyError: If ``choices``, ``message`` or ``content`` is missing.
    """
    payload = result["data"]
    usage = payload.get("usage", {})
    timings = payload.get("timings", {})
    first_choice = payload["choices"][0]
    body_text = first_choice["message"]["content"]
    finish = first_choice.get("finish_reason")

    print(f"\n=== {label} ===")
    print(f" wall: {result['wall']:.2f}s")
    print(f" finish_reason: {finish}")

    prompt_count = usage.get("prompt_tokens")
    completion_count = usage.get("completion_tokens")
    print(f" tokens: prompt={prompt_count} completion={completion_count}")

    # Server-side throughput figures are reported only when the response
    # actually carried a non-empty ``timings`` object.
    if timings:
        for tag, rate_key, ms_key in (
            ("prompt", "prompt_per_second", "prompt_ms"),
            ("predict", "predicted_per_second", "predicted_ms"),
        ):
            rate = timings.get(rate_key, 0)
            millis = timings.get(ms_key, 0)
            print(f" {tag} t/s: {rate:.1f} ({ms_key}={millis:.0f})")

    print(f" content chars: {len(body_text)}")
async def main() -> None:
    """Wait for the server to become healthy, then run every probe in order.

    The probes are awaited one at a time, never concurrently, so each
    measurement reflects a single in-flight request from this script.

    Raises:
        TimeoutError: If ``/health`` never returns a success status within the
            polling budget. Without this the probe would carry on and fail on
            the first request with an unrelated connection error.
    """
    health_attempts = 120
    health_pause_s = 2.0

    # Readiness wait.
    async with httpx.AsyncClient(timeout=5.0) as client:
        for _ in range(health_attempts):
            try:
                h = await client.get(f"{BASE}/health")
                if h.is_success:
                    break
            except httpx.HTTPError:
                # Deliberate best-effort: the server may not be accepting
                # connections yet, so keep polling.
                pass
            await asyncio.sleep(health_pause_s)
        else:
            # Loop ran out without a ``break``: the server never came up.
            raise TimeoutError(
                f"server at {BASE} did not report healthy after "
                f"{health_attempts} checks ({health_pause_s:.0f}s apart)"
            )

    print(f"SHORT chunks at 2000: {len(_chunk_text(SHORT))}")
    print(f"LONG chunks at 2000: {len(_chunk_text(LONG))}")
    print(
        f"LONG chunks at 8000: {len(_chunk_text(LONG, max_chars=8000, overlap=200))}"
    )

    # (label, input text, keyword arguments for ``_raw_call``), in run order.
    probes: list[tuple[str, str, dict[str, Any]]] = [
        (
            "constrained, max_tokens=900 (CURRENT adapter setting)",
            SHORT,
            {"max_tokens": 900, "constrained": True},
        ),
        (
            "constrained, max_tokens=1800",
            SHORT,
            {"max_tokens": 1800, "constrained": True},
        ),
        (
            "UNconstrained, max_tokens=900 (no json_schema grammar)",
            SHORT,
            {"max_tokens": 900, "constrained": False},
        ),
        (
            "constrained on LONG ~6KB, max_tokens=1800",
            LONG,
            {"max_tokens": 1800, "constrained": True},
        ),
        # Proposed new settings: temp 0.3 (focused), max_tokens 2000 (no truncation).
        (
            "PROPOSED: SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED: LONG-as-1-chunk temp=0.3, max_tokens=2000",
            LONG,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED x2 (variance check): SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
    ]
    for label, text, call_kwargs in probes:
        _summarize(label, await _raw_call(text, **call_kwargs))
# Script entry point: run the async probe only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())