"""STEP C — per-feature dedicated prompts + self-consistency. Compares to the A+B best.

C1: Input Quality as TWO single-feature binary prompts (goal_stated solo, specific_specification solo),
    removing any trade-off the combined prompt forced. Deterministic, cached.
C2: Self-consistency for Interaction — sample the v3 prompt 3× at temperature 0.6, majority vote
    (variance reduction on the noisy axis). Uses the vLLM `n` param (one request → 3 completions).

Run:  python -m eval.step_c
"""
from __future__ import annotations

import json
import os
import sys

import requests

from eval import kappa as K
from eval.prompts_v2 import _ask
from eval.prompts_v3 import build_interaction_prompt as int_v3

# Answer templates handed to `_ask` as its second argument by goal_prompt / spec_prompt.
# Each names the single boolean field that main() later reads back out of the model's reply.
_GOAL_T = '{"goal_stated": <bool>}'
_SPEC_T = '{"specific_specification": <bool>}'


def goal_prompt(msg):
    """Build the dedicated single-feature prompt for ``goal_stated``.

    The instruction is a question line followed by a line of TRUE/FALSE
    examples; it is passed to ``_ask`` together with the ``_GOAL_T`` answer
    template, the label "User message" and the message itself.

    Args:
        msg: the user turn to classify.

    Returns:
        Whatever ``_ask`` returns (presumably the finished prompt text).
    """
    question = (
        "Does THIS user message state a PURPOSE / use / audience / reason (a WHY) for the request — not just "
        "the task itself? TRUE only if a why/use/audience is present (stated or clearly implied)."
    )
    examples = (
        "  ▸ TRUE: 'summarize for my class', 'I'm building a budgeting app, write the schema', 'dedupe my data "
        "for analysis'.  ✗ FALSE (task only): 'write a sorting function', 'fix this code', 'list 10 names'."
    )
    instruction = "\n".join((question, examples))
    return _ask(instruction, _GOAL_T, "User message", msg)


# Answer template handed to `_ask` by decomp_prompt; it names the single boolean field "decomposition".
_DECOMP_T = '{"decomposition": <bool>}'


def decomp_prompt(msg):
    """Build the dedicated single-feature prompt for ``decomposition``.

    The instruction consists of three lines (question, TRUE examples, FALSE
    examples) and is passed to ``_ask`` with the ``_DECOMP_T`` answer template,
    the label "User message" and the message itself.

    Args:
        msg: the user turn to classify.

    Returns:
        Whatever ``_ask`` returns (presumably the finished prompt text).
    """
    question = (
        "Does THIS user message break the REQUEST into multiple ordered/numbered SUB-TASKS the AI must "
        "perform? TRUE for 'first X, then Y, then Z' or '1) … 2) … 3) …' of distinct sub-tasks."
    )
    positives = (
        "  ▸ TRUE: 'First summarize it, then list 3 risks, then propose fixes', '1) clean the data 2) plot it'."
    )
    negatives = (
        "  ✗ FALSE: a single ask ('write a sorting function'); 'write a step-by-step guide' (output, not a "
        "decomposed request); 'think step by step' (that's reasoning); a numbered list of data/options."
    )
    instruction = "\n".join((question, positives, negatives))
    return _ask(instruction, _DECOMP_T, "User message", msg)


def spec_prompt(msg):
    """Build the dedicated single-feature prompt for ``specific_specification``.

    The instruction is a deliberately liberal question ("a single concrete
    detail is enough") followed by a line of TRUE/FALSE examples; it is passed
    to ``_ask`` with the ``_SPEC_T`` answer template, the label "User message"
    and the message itself.

    Args:
        msg: the user turn to classify.

    Returns:
        Whatever ``_ask`` returns (presumably the finished prompt text).
    """
    question = (
        "Does THIS user message contain ANY concrete detail of ANY kind — a number, version, named "
        "tool/library/framework, file/data/schema, an explicit constraint, OR the user's OWN "
        "background/role/skill level? Be LIBERAL: a single concrete detail is enough."
    )
    examples = (
        "  ▸ TRUE: 'Python 3.11', 'I'm a beginner', 'under 100ms', 'here's my CSV', 'using pandas', "
        "'a 1000-word post'.  ✗ FALSE only when fully vague: 'make it fast', 'help me', 'write something good'."
    )
    instruction = "\n".join((question, examples))
    return _ask(instruction, _SPEC_T, "User message", msg)


class SamplingClient:
    """vLLM OpenAI-style endpoint with n completions per request (for self-consistency).

    One call to :meth:`sample` sends a single chat-completions request with the
    ``n`` parameter set, so the server returns several completions for the same
    prompt.

    Args:
        base_url: endpoint root; trailing slashes are stripped.
        token: bearer token sent in the ``Authorization`` header.
        timeout: per-request timeout forwarded to ``requests.post``.
        model: model name placed in the request body (keyword-only).
        max_tokens: completion length cap placed in the request body (keyword-only).
    """

    def __init__(self, base_url: str, token: str, timeout: float = 90, *,
                 model: str = "MiniCPM4.1-8B", max_tokens: int = 256):
        self.base = base_url.rstrip("/")
        self.token = token
        self.timeout = timeout
        # Previously hard-coded inside sample(); the defaults keep the old request body unchanged.
        self.model = model
        self.max_tokens = max_tokens

    def sample(self, prompt, n=3, temperature=0.6):
        """Request ``n`` completions for ``prompt`` and return their cleaned texts.

        Args:
            prompt: sent as the content of a single user message.
            n: number of completions requested in the one HTTP call.
            temperature: sampling temperature placed in the request body.

        Returns:
            A list with one entry per returned choice, each being the choice's
            message content passed through ``clean_content``.

        Raises:
            requests.HTTPError: if the server answers with an error status
                (via ``raise_for_status``).
        """
        # Imported lazily, as in the rest of this script; done before the request so an
        # import failure does not cost a network round-trip.
        from prompt_card.llm.minicpm import clean_content

        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "n": n,
            "max_tokens": self.max_tokens,
            "chat_template_kwargs": {"enable_thinking": False},
        }
        headers = {
            "Authorization": f"Bearer {self.token}",
            "Content-Type": "application/json",
        }
        r = requests.post(f"{self.base}/v1/chat/completions",
                          headers=headers, json=body, timeout=self.timeout)
        r.raise_for_status()
        return [clean_content(c["message"]["content"]) for c in r.json()["choices"]]


def _flag(raw, key):
    """Parse one model reply with ``K.OA.parse`` and return its boolean field ``key`` as 0/1.

    A falsy parse result or a missing key counts as 0.
    """
    parsed = K.OA.parse(raw, (key,)) or {}
    return int(bool(parsed.get(key)))


def main():
    """Run STEP C: per-feature Input Quality prompts (C1) and self-consistency Interaction (C2).

    Reads the endpoint from ``OPENBMB_BASE_URL`` / ``OPENBMB_TOKEN`` and exits with
    status 2 if either is unset. Prints Cohen's kappa and the binary counts for each
    axis, then writes the kappa values to ``_cache/step_c.json`` next to this file.
    """
    base = os.environ.get("OPENBMB_BASE_URL")
    token = os.environ.get("OPENBMB_TOKEN")
    if not base or not token:
        print("ERROR: set OPENBMB_BASE_URL/OPENBMB_TOKEN", file=sys.stderr)
        sys.exit(2)

    import concurrent.futures as cf

    from prompt_card.llm.minicpm import MiniCPMClient

    gt = K.load_gt()
    convs = K.load_convs()
    client = K.CachedClient(MiniCPMClient(base, token), workers=8)

    # ---- C1: per-feature IQ ----
    goal_prompts, spec_prompts = [], []
    gyt, syt = [], []  # ground-truth 0/1 labels, aligned with the prompt lists
    for rec in gt:
        ut = K.user_turns(convs[rec["id"]])
        for row in rec["input_quality"]:
            # The turn label's first character is dropped and the rest is read as a
            # 1-based number, giving a 0-based index into the user turns.
            i = int(row["turn"][1:]) - 1
            goal_prompts.append(goal_prompt(ut[i]))
            spec_prompts.append(spec_prompt(ut[i]))
            gyt.append(int("goal_stated" in row["features"]))
            syt.append(int("specific_specification" in row["features"]))
    gres = client.run_all(goal_prompts)
    sres = client.run_all(spec_prompts)
    # run_all results are looked up by prompt text.
    gyp = [_flag(gres[p], "goal_stated") for p in goal_prompts]
    syp = [_flag(sres[p], "specific_specification") for p in spec_prompts]
    gk = K.cohen_kappa(gyt, gyp)
    sk = K.cohen_kappa(syt, syp)
    print("=== C1 per-feature IQ (dedicated single-feature prompts) ===")
    print(f"  goal_stated           κ={gk:+.3f}  [TN/FP/FN/TP {K.binary_counts(gyt,gyp)}]   (A+B best v2: +0.226)")
    print(f"  specific_specification κ={sk:+.3f}  [TN/FP/FN/TP {K.binary_counts(syt,syp)}]   (A+B best v2: +0.568)")
    print(f"  IQ headline (mean)    κ={(gk+sk)/2:+.3f}   (A+B best: +0.397)")

    # ---- C2: self-consistency interaction ----
    sc = SamplingClient(base, token)
    items = []  # (ground-truth refinement 0/1, interaction prompt)
    for rec in gt:
        ut = K.user_turns(convs[rec["id"]])
        for row in rec["interaction"]:
            i = int(row["turn"][1:]) - 1
            # NOTE(review): if i were 0, ut[i - 1] would wrap around to the LAST user turn.
            # Presumably interaction rows only exist from the second user turn on — confirm.
            items.append((int(bool(row["refinement"])), int_v3(ut[i - 1], ut[i])))
    print(f"\n=== C2 self-consistency interaction (3×, temp 0.6, majority) over {len(items)} turns ===", flush=True)

    def vote(prompt):
        """Sample the prompt 3× and return 1 when at least 2 completions say refinement_attempt."""
        outs = sc.sample(prompt, n=3, temperature=0.6)
        yes = sum(_flag(o, "refinement_attempt") for o in outs)
        return int(yes >= 2)

    prompts = [p for _, p in items]
    # ex.map preserves input order, so votes line up with items.
    with cf.ThreadPoolExecutor(max_workers=8) as ex:
        iyp = list(ex.map(vote, prompts))
    iyt = [t for t, _ in items]
    ik = K.cohen_kappa(iyt, iyp)
    print(f"  interaction (self-consistency) κ={ik:+.3f}  [TN/FP/FN/TP {K.binary_counts(iyt,iyp)}]   (A+B best v3: +0.320)")

    # Write the summary under a context manager so the handle is flushed and closed, and
    # make sure the target directory exists so a finished run is not lost at the last step.
    out_dir = os.path.join(os.path.dirname(__file__), "_cache")
    os.makedirs(out_dir, exist_ok=True)
    summary = {"goal": gk, "spec": sk, "iq_mean": (gk + sk) / 2, "interaction_sc": ik}
    with open(os.path.join(out_dir, "step_c.json"), "w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=1)


# Script entry point: run the STEP C evaluation when executed directly (e.g. `python -m eval.step_c`).
if __name__ == "__main__":
    main()