"""Throwaway latency probe for the M04b text->graph call path.

Measures where time goes in a schema-constrained extraction call against the live
llama.cpp server, isolating: (a) constrained vs unconstrained generation, (b) the
effect of ``max_tokens``, and (c) chunk size / call-count scaling. Prints the
server's own ``timings`` (prompt/predicted tokens-per-second) so we tune from data,
not guesswork. Not a unit test; safe to delete.
"""

from __future__ import annotations

import asyncio
import time
from typing import Any

import httpx
from loosecanvas.extractors.text_graph_adapter import (
    _SYSTEM_PROMPT,
    TEXT_GRAPH_RESPONSE_FORMAT,
    _chunk_text,
)

# Root URL of the server under test; endpoint paths are appended to it.
BASE = "http://127.0.0.1:8080"

# Three-sentence paragraph used as the small probe input.
SHORT = " ".join(
    [
        "Photosynthesis is the process by which plants convert sunlight into "
        "chemical energy.",
        "Chlorophyll absorbs light, and the Calvin cycle fixes carbon dioxide "
        "into glucose.",
        "Oxygen is released as a byproduct.",
    ]
)

# Larger probe input: SHORT repeated 25 times, each copy followed by a single
# space (about 5,000 characters), to exercise multi-chunk behaviour.
LONG = f"{SHORT} " * 25


async def _raw_call(
    text: str, *, max_tokens: int, constrained: bool, temperature: float = 1.0
) -> dict[str, Any]:
    """POST one chat-completion request and time the round trip.

    The request pairs ``_SYSTEM_PROMPT`` (system role) with ``text`` (user role)
    and is sent to ``{BASE}/v1/chat/completions``.

    Args:
        text: Content of the user message.
        max_tokens: Forwarded as the request's ``max_tokens``.
        constrained: When true, ``TEXT_GRAPH_RESPONSE_FORMAT`` is attached as
            ``response_format``; otherwise that field is omitted entirely.
        temperature: Forwarded as the request's ``temperature``.

    Returns:
        ``{"wall": seconds, "data": parsed_json}``, where ``wall`` is the
        ``time.perf_counter`` interval spent awaiting the POST alone (the
        client is created before the timer starts).

    Raises:
        httpx.HTTPStatusError: raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    conversation = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    payload: dict[str, Any] = dict(
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        top_k=64,
        stop=["<turn|>"],
        # NOTE(review): presumably turns off the chat template's thinking mode
        # so only the answer is generated -- confirm against the server docs.
        chat_template_kwargs={"enable_thinking": False},
        # NOTE(review): requested so the response carries the server's
        # ``timings`` block that the summary prints -- confirm.
        timings_per_token=True,
    )
    if constrained:
        payload["response_format"] = TEXT_GRAPH_RESPONSE_FORMAT

    endpoint = f"{BASE}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300.0) as client:
        started = time.perf_counter()
        response = await client.post(endpoint, json=payload)
        elapsed = time.perf_counter() - started

    # Status is checked only after the timing sample has been taken.
    response.raise_for_status()
    return {"wall": elapsed, "data": response.json()}


def _summarize(label: str, result: dict[str, Any]) -> None:
    """Print a short report for one probe result.

    Args:
        label: Heading printed between ``===`` markers.
        result: Mapping with ``"wall"`` (seconds, float) and ``"data"`` (the
            decoded chat-completion response). ``data`` must contain
            ``choices[0].message.content``; ``usage``, ``timings`` and
            ``finish_reason`` are optional.

    The report shows wall time, finish reason, prompt/completion token counts,
    the length of the generated content and, when the response carries a
    non-empty ``timings`` mapping, the prompt and predict throughput lines.
    """
    payload = result["data"]
    token_usage = payload.get("usage", {})
    server_timings = payload.get("timings", {})
    first_choice = payload["choices"][0]
    generated = first_choice["message"]["content"]
    finish_reason = first_choice.get("finish_reason")

    print(f"\n=== {label} ===")
    print(f"  wall:           {result['wall']:.2f}s")
    print(f"  finish_reason:  {finish_reason}")

    prompt_count = token_usage.get("prompt_tokens")
    completion_count = token_usage.get("completion_tokens")
    print(f"  tokens:         prompt={prompt_count} completion={completion_count}")

    if server_timings:
        # (line prefix, tokens-per-second key, elapsed-milliseconds key)
        throughput_rows = (
            ("  prompt t/s:     ", "prompt_per_second", "prompt_ms"),
            ("  predict t/s:    ", "predicted_per_second", "predicted_ms"),
        )
        for prefix, rate_key, ms_key in throughput_rows:
            rate = server_timings.get(rate_key, 0)
            millis = server_timings.get(ms_key, 0)
            print(f"{prefix}{rate:.1f}  ({ms_key}={millis:.0f})")

    print(f"  content chars:  {len(generated)}")

async def main() -> None:
    """Run the latency probe end to end, printing one report per scenario.

    Steps:
      1. Poll ``{BASE}/health`` (up to 120 attempts, 2 s apart) until it returns
         a success status.
      2. Print how many chunks ``_chunk_text`` produces for ``SHORT`` and
         ``LONG``.
      3. Issue the probe requests strictly one after another and summarize each.

    Raises:
        RuntimeError: if the server never reports healthy within the polling
            budget. Without this check the probe would carry on and fail later
            with a less obvious error from the first real request.
        httpx.HTTPError: propagated from ``_raw_call`` on transport or HTTP
            status errors.
    """
    max_attempts = 120
    poll_interval_s = 2.0

    # Readiness wait: the server may not be accepting requests yet.
    async with httpx.AsyncClient(timeout=5.0) as client:
        for _ in range(max_attempts):
            try:
                health = await client.get(f"{BASE}/health")
                if health.is_success:
                    break
            except httpx.HTTPError:
                # Not reachable (or timed out) yet; retry after the pause.
                pass
            await asyncio.sleep(poll_interval_s)
        else:
            # Loop ran to completion without a successful health response.
            raise RuntimeError(
                f"server at {BASE} did not report healthy after "
                f"{max_attempts} attempts"
            )

    # NOTE(review): the "at 2000" labels assume ``_chunk_text``'s default
    # ``max_chars`` is 2000 -- confirm against the adapter.
    print(f"SHORT chunks at 2000: {len(_chunk_text(SHORT))}")
    print(f"LONG  chunks at 2000: {len(_chunk_text(LONG))}")
    print(
        f"LONG  chunks at 8000: {len(_chunk_text(LONG, max_chars=8000, overlap=200))}"
    )

    # (report label, input text, keyword arguments for ``_raw_call``).
    # Entries without ``temperature`` use ``_raw_call``'s default.
    probes: list[tuple[str, str, dict[str, Any]]] = [
        (
            "constrained, max_tokens=900 (CURRENT adapter setting)",
            SHORT,
            {"max_tokens": 900, "constrained": True},
        ),
        (
            "constrained, max_tokens=1800",
            SHORT,
            {"max_tokens": 1800, "constrained": True},
        ),
        (
            "UNconstrained, max_tokens=900 (no json_schema grammar)",
            SHORT,
            {"max_tokens": 900, "constrained": False},
        ),
        (
            "constrained on LONG ~6KB, max_tokens=1800",
            LONG,
            {"max_tokens": 1800, "constrained": True},
        ),
        # Proposed new settings: temp 0.3 (focused), max_tokens 2000 (no truncation).
        (
            "PROPOSED: SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED: LONG-as-1-chunk temp=0.3, max_tokens=2000",
            LONG,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
        (
            "PROPOSED x2 (variance check): SHORT temp=0.3, max_tokens=2000",
            SHORT,
            {"max_tokens": 2000, "constrained": True, "temperature": 0.3},
        ),
    ]

    # Kept sequential so each wall time reflects a single in-flight request.
    for label, text, call_kwargs in probes:
        _summarize(label, await _raw_call(text, **call_kwargs))


# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())