Spaces:

build-small-hackathon
/

loosecanvas

Running

loosecanvas / scripts /text_graph_latency_probe.py

Joshua Sundance Bailey

loosecanvas: local AI thought-mapping canvas with a trust-tagged knowledge graph

6d1438c 19 days ago

4.67 kB

	"""Throwaway latency probe for the M04b text->graph call path.

	Measures where time goes in a schema-constrained extraction call against the live
	llama.cpp server, isolating: (a) constrained vs unconstrained generation, (b) the
	effect of ``max_tokens``, and (c) chunk size / call-count scaling. Prints the
	server's own ``timings`` (prompt/predicted tokens-per-second) so we tune from data,
	not guesswork. Not a unit test; safe to delete.
	"""

	from __future__ import annotations

	import asyncio
	import time
	from typing import Any

	import httpx
	from loosecanvas.extractors.text_graph_adapter import (
	_SYSTEM_PROMPT,
	TEXT_GRAPH_RESPONSE_FORMAT,
	_chunk_text,
	)

	# Root URL of the llama.cpp server every probe request is sent to.
	BASE = "http://127.0.0.1:8080"

	# Small input: a three-sentence passage.
	SHORT = " ".join(
	    (
	        "Photosynthesis is the process by which plants convert sunlight into chemical energy.",
	        "Chlorophyll absorbs light, and the Calvin cycle fixes carbon dioxide into glucose.",
	        "Oxygen is released as a byproduct.",
	    )
	)

	# Large input for multi-chunk behaviour: 25 copies of SHORT, each followed by
	# one space (roughly 5 KB of prose).
	LONG = f"{SHORT} " * 25


	async def _raw_call(
	    text: str, *, max_tokens: int, constrained: bool, temperature: float = 1.0
	) -> dict[str, Any]:
	    """POST one chat-completion request and time the round trip.

	    Args:
	        text: Content of the user message, sent after the shared system prompt.
	        max_tokens: Value of the request's ``max_tokens`` field.
	        constrained: When true, ``TEXT_GRAPH_RESPONSE_FORMAT`` is attached as
	            ``response_format``; otherwise that field is omitted.
	        temperature: Value of the request's ``temperature`` field.

	    Returns:
	        ``{"wall": seconds, "data": parsed_json}``. ``wall`` spans only the
	        awaited ``client.post`` call; JSON decoding is not included.

	    Raises:
	        httpx.HTTPStatusError: If ``raise_for_status`` rejects the response.
	    """
	    body: dict[str, Any] = {
	        "messages": [
	            {"role": "system", "content": _SYSTEM_PROMPT},
	            {"role": "user", "content": text},
	        ],
	        "max_tokens": max_tokens,
	        "temperature": temperature,
	        "top_p": 0.95,
	        "top_k": 64,
	        # Plain "|" here: writing it as backslash-pipe is an invalid escape
	        # that leaves a literal backslash inside the stop sequence.
	        "stop": ["<turn|>"],
	        "chat_template_kwargs": {"enable_thinking": False},
	        "timings_per_token": True,
	    }
	    if constrained:
	        body["response_format"] = TEXT_GRAPH_RESPONSE_FORMAT

	    async with httpx.AsyncClient(timeout=300.0) as client:
	        t0 = time.perf_counter()
	        resp = await client.post(f"{BASE}/v1/chat/completions", json=body)
	        # Stop the clock before status checking so error handling is not timed.
	        wall = time.perf_counter() - t0
	        resp.raise_for_status()
	        data = resp.json()
	    return {"wall": wall, "data": data}


	def _summarize(label: str, result: dict[str, Any]) -> None:
	    """Print a short report for one probe result.

	    Args:
	        label: Heading printed above the report.
	        result: Mapping with ``"wall"`` (seconds, float) and ``"data"`` (the
	            decoded chat-completion response). ``usage`` and ``timings`` are
	            optional in ``data``; ``choices[0]["message"]`` is required.

	    Raises:
	        KeyError, IndexError: If ``data`` has no ``choices[0]["message"]``.
	    """
	    data = result["data"]
	    usage = data.get("usage", {})
	    timings = data.get("timings", {})
	    choice = data["choices"][0]
	    # ``content`` can be absent or JSON null; count that as empty rather than
	    # failing on len(None) after the measurement has already been taken.
	    content = choice["message"].get("content") or ""
	    finish = choice.get("finish_reason")

	    print(f"\n=== {label} ===")
	    print(f" wall: {result['wall']:.2f}s")
	    print(f" finish_reason: {finish}")
	    print(
	        f" tokens: prompt={usage.get('prompt_tokens')} "
	        f"completion={usage.get('completion_tokens')}"
	    )
	    # The server-side timing lines are printed only when the response has them.
	    if timings:
	        print(
	            f" prompt t/s: {timings.get('prompt_per_second', 0):.1f} "
	            f"(prompt_ms={timings.get('prompt_ms', 0):.0f})"
	        )
	        print(
	            f" predict t/s: {timings.get('predicted_per_second', 0):.1f} "
	            f"(predicted_ms={timings.get('predicted_ms', 0):.0f})"
	        )
	    print(f" content chars: {len(content)}")


	async def main() -> None:
	    """Wait for the server, print chunk counts, then run each probe case in order.

	    Raises:
	        SystemExit: If ``/health`` never returns a success status within the
	            polling budget (120 attempts, 2 s apart).
	    """
	    # Readiness wait.
	    async with httpx.AsyncClient(timeout=5.0) as client:
	        for _ in range(120):
	            try:
	                h = await client.get(f"{BASE}/health")
	            except httpx.HTTPError:
	                # Server not reachable yet; retry after the pause below.
	                pass
	            else:
	                if h.is_success:
	                    break
	            await asyncio.sleep(2.0)
	        else:
	            # Loop ran out without a healthy response: fail fast with a clear
	            # message instead of sending probe requests that cannot succeed.
	            raise SystemExit(f"server at {BASE} never became healthy; aborting probe")

	    print(f"SHORT chunks at 2000: {len(_chunk_text(SHORT))}")
	    print(f"LONG chunks at 2000: {len(_chunk_text(LONG))}")
	    print(
	        f"LONG chunks at 8000: {len(_chunk_text(LONG, max_chars=8000, overlap=200))}"
	    )

	    # (label, input text, keyword arguments for _raw_call), run in this order.
	    cases: list[tuple[str, str, dict[str, Any]]] = [
	        (
	            "constrained, max_tokens=900 (CURRENT adapter setting)",
	            SHORT,
	            {"max_tokens": 900, "constrained": True},
	        ),
	        (
	            "constrained, max_tokens=1800",
	            SHORT,
	            {"max_tokens": 1800, "constrained": True},
	        ),
	        (
	            "UNconstrained, max_tokens=900 (no json_schema grammar)",
	            SHORT,
	            {"max_tokens": 900, "constrained": False},
	        ),
	        (
	            "constrained on LONG ~6KB, max_tokens=1800",
	            LONG,
	            {"max_tokens": 1800, "constrained": True},
	        ),
	        # Proposed new settings: temp 0.3 (focused), max_tokens 2000 (no truncation).
	        (
	            "PROPOSED: SHORT temp=0.3, max_tokens=2000",
	            SHORT,
	            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
	        ),
	        (
	            "PROPOSED: LONG-as-1-chunk temp=0.3, max_tokens=2000",
	            LONG,
	            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
	        ),
	        (
	            "PROPOSED x2 (variance check): SHORT temp=0.3, max_tokens=2000",
	            SHORT,
	            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
	        ),
	    ]
	    # Each request is awaited before the next one starts, so calls never overlap.
	    for label, text, kwargs in cases:
	        _summarize(label, await _raw_call(text, **kwargs))


	if __name__ == "__main__":
	    asyncio.run(main())