# Joshua Sundance Bailey
# loosecanvas: local AI thought-mapping canvas with a trust-tagged knowledge graph
# 6d1438c
"""Throwaway latency probe for the M04b text->graph call path.

Measures where time goes in a schema-constrained extraction call against the live
llama.cpp server, isolating: (a) constrained vs unconstrained generation, (b) the
effect of ``max_tokens``, and (c) chunk size / call-count scaling. Prints the
server's own ``timings`` (prompt/predicted tokens-per-second) so we tune from data,
not guesswork. Not a unit test; safe to delete.
"""
from __future__ import annotations

import asyncio
import time
from typing import Any

import httpx

from loosecanvas.extractors.text_graph_adapter import (
    _SYSTEM_PROMPT,
    TEXT_GRAPH_RESPONSE_FORMAT,
    _chunk_text,
)
# Root URL of the llama.cpp server the probe talks to (local loopback).
BASE = "http://127.0.0.1:8080"

# Small three-sentence sample (202 characters) used for the single-call cases.
SHORT = (
    "Photosynthesis is the process by which plants convert sunlight into "
    "chemical energy. Chlorophyll absorbs light, and the Calvin cycle fixes "
    "carbon dioxide into glucose. Oxygen is released as a byproduct."
)

# 25 space-terminated repetitions of SHORT (about 5 KB of prose) to exercise
# multi-chunk behaviour.
LONG = f"{SHORT} " * 25
async def _raw_call(
    text: str, *, max_tokens: int, constrained: bool, temperature: float = 1.0
) -> dict[str, Any]:
    """Send one chat-completions request and time the round trip.

    Args:
        text: User message content; the system message is ``_SYSTEM_PROMPT``.
        max_tokens: Generation cap forwarded to the server.
        constrained: When true, attach ``TEXT_GRAPH_RESPONSE_FORMAT`` as the
            request's ``response_format``.
        temperature: Sampling temperature forwarded to the server.

    Returns:
        ``{"wall": <seconds spent in the POST>, "data": <decoded JSON body>}``.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    conversation = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    payload: dict[str, Any] = dict(
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        top_k=64,
        stop=["<turn|>"],
        chat_template_kwargs={"enable_thinking": False},
        timings_per_token=True,
    )
    if constrained:
        payload["response_format"] = TEXT_GRAPH_RESPONSE_FORMAT

    endpoint = f"{BASE}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300.0) as session:
        # Only the POST itself is timed; status check and JSON decoding are not.
        started = time.perf_counter()
        response = await session.post(endpoint, json=payload)
        elapsed = time.perf_counter() - started
        response.raise_for_status()
        decoded = response.json()
    return {"wall": elapsed, "data": decoded}
def _summarize(label: str, result: dict[str, Any]) -> None:
    """Print a short timing report for one ``_raw_call`` result.

    Args:
        label: Heading printed above the figures.
        result: Mapping with ``"wall"`` (elapsed seconds) and ``"data"`` (the
            decoded chat-completions response body).

    The ``usage`` and ``timings`` sections are treated as optional: a missing
    or null ``usage`` prints ``None`` for its counts, and a missing or null
    ``timings`` skips the tokens-per-second lines. A missing or null message
    ``content`` is reported as zero characters instead of raising.

    Raises:
        KeyError / IndexError: If ``result`` lacks ``"wall"``/``"data"`` or the
            response has no first choice with a ``message``.
    """
    data = result["data"]
    # ``.get(key, {})`` still yields None for an explicit JSON null, so
    # normalise with ``or`` before calling ``.get`` on the section.
    usage = data.get("usage") or {}
    timings = data.get("timings") or {}
    choice = data["choices"][0]
    # The message content may be null; count that as empty text.
    content = choice["message"].get("content") or ""
    finish = choice.get("finish_reason")

    print(f"\n=== {label} ===")
    print(f" wall: {result['wall']:.2f}s")
    print(f" finish_reason: {finish}")
    print(
        f" tokens: prompt={usage.get('prompt_tokens')} "
        f"completion={usage.get('completion_tokens')}"
    )
    if timings:
        # ``or 0`` keeps the float format specs valid if a figure is null.
        print(
            f" prompt t/s: {timings.get('prompt_per_second') or 0:.1f} "
            f"(prompt_ms={timings.get('prompt_ms') or 0:.0f})"
        )
        print(
            f" predict t/s: {timings.get('predicted_per_second') or 0:.1f} "
            f"(predicted_ms={timings.get('predicted_ms') or 0:.0f})"
        )
    print(f" content chars: {len(content)}")
async def _wait_for_server(attempts: int = 120, delay: float = 2.0) -> bool:
    """Poll ``{BASE}/health`` until it answers with a success status.

    Args:
        attempts: Maximum number of health probes.
        delay: Seconds to sleep after each unsuccessful probe.

    Returns:
        ``True`` as soon as one probe succeeds, ``False`` if every probe failed.
    """
    async with httpx.AsyncClient(timeout=5.0) as client:
        for _ in range(attempts):
            try:
                health = await client.get(f"{BASE}/health")
            except httpx.HTTPError:
                pass  # not reachable yet; sleep and retry
            else:
                if health.is_success:
                    return True
            await asyncio.sleep(delay)
    return False


async def main() -> None:
    """Run the probe: wait for the server, print chunk counts, time each scenario.

    Scenarios run strictly one after another so their timings do not
    interfere with each other.
    """
    if not await _wait_for_server():
        # Best effort: keep going as before, but say why the calls below may fail.
        print(f"WARNING: {BASE}/health never reported ready; continuing anyway")

    print(f"SHORT chunks at 2000: {len(_chunk_text(SHORT))}")
    print(f"LONG chunks at 2000: {len(_chunk_text(LONG))}")
    print(
        f"LONG chunks at 8000: {len(_chunk_text(LONG, max_chars=8000, overlap=200))}"
    )

    # (label, input text, keyword arguments for ``_raw_call``), in run order.
    scenarios: tuple[tuple[str, str, dict[str, Any]], ...] = (
        (
            "constrained, max_tokens=900 (CURRENT adapter setting)",
            SHORT,
            {"max_tokens": 900, "constrained": True},
        ),
        (
            "constrained, max_tokens=1800",
            SHORT,
            {"max_tokens": 1800, "constrained": True},
        ),
        (
            "UNconstrained, max_tokens=900 (no json_schema grammar)",
            SHORT,
            {"max_tokens": 900, "constrained": False},
        ),
        (
            "constrained on LONG ~6KB, max_tokens=1800",
            LONG,
            {"max_tokens": 1800, "constrained": True},
        ),
        # Proposed new settings: temp 0.3 (focused), max_tokens 2000 (no truncation).
        (
            "PROPOSED: SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED: LONG-as-1-chunk temp=0.3, max_tokens=2000",
            LONG,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED x2 (variance check): SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
    )
    for label, text, options in scenarios:
        _summarize(label, await _raw_call(text, **options))


if __name__ == "__main__":
    asyncio.run(main())