"""Throwaway latency probe for the M04b text->graph call path. Measures where time goes in a schema-constrained extraction call against the live llama.cpp server, isolating: (a) constrained vs unconstrained generation, (b) the effect of ``max_tokens``, and (c) chunk size / call-count scaling. Prints the server's own ``timings`` (prompt/predicted tokens-per-second) so we tune from data, not guesswork. Not a unit test; safe to delete. """ from __future__ import annotations import asyncio import time from typing import Any import httpx from loosecanvas.extractors.text_graph_adapter import ( _SYSTEM_PROMPT, TEXT_GRAPH_RESPONSE_FORMAT, _chunk_text, ) BASE = "http://127.0.0.1:8080" SHORT = ( "Photosynthesis is the process by which plants convert sunlight into chemical " "energy. Chlorophyll absorbs light, and the Calvin cycle fixes carbon dioxide " "into glucose. Oxygen is released as a byproduct." ) # ~6 KB of prose to exercise multi-chunk behaviour. LONG = (SHORT + " ") * 25 async def _raw_call( text: str, *, max_tokens: int, constrained: bool, temperature: float = 1.0 ) -> dict[str, Any]: body: dict[str, Any] = { "messages": [ {"role": "system", "content": _SYSTEM_PROMPT}, {"role": "user", "content": text}, ], "max_tokens": max_tokens, "temperature": temperature, "top_p": 0.95, "top_k": 64, "stop": [""], "chat_template_kwargs": {"enable_thinking": False}, "timings_per_token": True, } if constrained: body["response_format"] = TEXT_GRAPH_RESPONSE_FORMAT async with httpx.AsyncClient(timeout=300.0) as client: t0 = time.perf_counter() resp = await client.post(f"{BASE}/v1/chat/completions", json=body) wall = time.perf_counter() - t0 resp.raise_for_status() data = resp.json() return {"wall": wall, "data": data} def _summarize(label: str, result: dict[str, Any]) -> None: data = result["data"] usage = data.get("usage", {}) timings = data.get("timings", {}) content = data["choices"][0]["message"]["content"] finish = data["choices"][0].get("finish_reason") print(f"\n=== {label} ===") print(f" wall: {result['wall']:.2f}s") print(f" finish_reason: {finish}") print( f" tokens: prompt={usage.get('prompt_tokens')} " f"completion={usage.get('completion_tokens')}" ) if timings: print( f" prompt t/s: {timings.get('prompt_per_second', 0):.1f} " f"(prompt_ms={timings.get('prompt_ms', 0):.0f})" ) print( f" predict t/s: {timings.get('predicted_per_second', 0):.1f} " f"(predicted_ms={timings.get('predicted_ms', 0):.0f})" ) print(f" content chars: {len(content)}") async def main() -> None: # Readiness wait. async with httpx.AsyncClient(timeout=5.0) as client: for _ in range(120): try: h = await client.get(f"{BASE}/health") if h.is_success: break except httpx.HTTPError: pass await asyncio.sleep(2.0) print(f"SHORT chunks at 2000: {len(_chunk_text(SHORT))}") print(f"LONG chunks at 2000: {len(_chunk_text(LONG))}") print( f"LONG chunks at 8000: {len(_chunk_text(LONG, max_chars=8000, overlap=200))}" ) _summarize( "constrained, max_tokens=900 (CURRENT adapter setting)", await _raw_call(SHORT, max_tokens=900, constrained=True), ) _summarize( "constrained, max_tokens=1800", await _raw_call(SHORT, max_tokens=1800, constrained=True), ) _summarize( "UNconstrained, max_tokens=900 (no json_schema grammar)", await _raw_call(SHORT, max_tokens=900, constrained=False), ) _summarize( "constrained on LONG ~6KB, max_tokens=1800", await _raw_call(LONG, max_tokens=1800, constrained=True), ) # Proposed new settings: temp 0.3 (focused), max_tokens 2000 (no truncation). _summarize( "PROPOSED: SHORT temp=0.3, max_tokens=2000", await _raw_call(SHORT, max_tokens=2000, constrained=True, temperature=0.3), ) _summarize( "PROPOSED: LONG-as-1-chunk temp=0.3, max_tokens=2000", await _raw_call(LONG, max_tokens=2000, constrained=True, temperature=0.3), ) _summarize( "PROPOSED x2 (variance check): SHORT temp=0.3, max_tokens=2000", await _raw_call(SHORT, max_tokens=2000, constrained=True, temperature=0.3), ) if __name__ == "__main__": asyncio.run(main())