"""STEP C — per-feature dedicated prompts + self-consistency. Compares to the A+B best. C1: Input Quality as TWO single-feature binary prompts (goal_stated solo, specific_specification solo), removing any trade-off the combined prompt forced. Deterministic, cached. C2: Self-consistency for Interaction — sample the v3 prompt 3× at temperature 0.6, majority vote (variance reduction on the noisy axis). Uses the vLLM `n` param (one request → 3 completions). Run: python -m eval.step_c """ from __future__ import annotations import json import os import sys import requests from eval import kappa as K from eval.prompts_v2 import _ask from eval.prompts_v3 import build_interaction_prompt as int_v3 _GOAL_T = '{"goal_stated": }' _SPEC_T = '{"specific_specification": }' def goal_prompt(msg): instr = ( "Does THIS user message state a PURPOSE / use / audience / reason (a WHY) for the request — not just " "the task itself? TRUE only if a why/use/audience is present (stated or clearly implied).\n" " ▸ TRUE: 'summarize for my class', 'I'm building a budgeting app, write the schema', 'dedupe my data " "for analysis'. ✗ FALSE (task only): 'write a sorting function', 'fix this code', 'list 10 names'." ) return _ask(instr, _GOAL_T, "User message", msg) _DECOMP_T = '{"decomposition": }' def decomp_prompt(msg): instr = ( "Does THIS user message break the REQUEST into multiple ordered/numbered SUB-TASKS the AI must " "perform? TRUE for 'first X, then Y, then Z' or '1) … 2) … 3) …' of distinct sub-tasks.\n" " ▸ TRUE: 'First summarize it, then list 3 risks, then propose fixes', '1) clean the data 2) plot it'.\n" " ✗ FALSE: a single ask ('write a sorting function'); 'write a step-by-step guide' (output, not a " "decomposed request); 'think step by step' (that's reasoning); a numbered list of data/options." ) return _ask(instr, _DECOMP_T, "User message", msg) def spec_prompt(msg): instr = ( "Does THIS user message contain ANY concrete detail of ANY kind — a number, version, named " "tool/library/framework, file/data/schema, an explicit constraint, OR the user's OWN " "background/role/skill level? Be LIBERAL: a single concrete detail is enough.\n" " ▸ TRUE: 'Python 3.11', 'I'm a beginner', 'under 100ms', 'here's my CSV', 'using pandas', " "'a 1000-word post'. ✗ FALSE only when fully vague: 'make it fast', 'help me', 'write something good'." ) return _ask(instr, _SPEC_T, "User message", msg) class SamplingClient: """vLLM OpenAI-style endpoint with n completions per request (for self-consistency).""" def __init__(self, base_url, token, timeout=90): self.base = base_url.rstrip("/"); self.token = token; self.timeout = timeout def sample(self, prompt, n=3, temperature=0.6): body = {"model": "MiniCPM4.1-8B", "messages": [{"role": "user", "content": prompt}], "temperature": temperature, "n": n, "max_tokens": 256, "chat_template_kwargs": {"enable_thinking": False}} r = requests.post(f"{self.base}/v1/chat/completions", headers={"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}, json=body, timeout=self.timeout) r.raise_for_status() from prompt_card.llm.minicpm import clean_content return [clean_content(c["message"]["content"]) for c in r.json()["choices"]] def main(): base = os.environ.get("OPENBMB_BASE_URL"); token = os.environ.get("OPENBMB_TOKEN") if not base or not token: print("ERROR: set OPENBMB_BASE_URL/OPENBMB_TOKEN", file=sys.stderr); sys.exit(2) from prompt_card.llm.minicpm import MiniCPMClient gt = K.load_gt(); convs = K.load_convs() client = K.CachedClient(MiniCPMClient(base, token), workers=8) # ---- C1: per-feature IQ ---- goal_prompts, spec_prompts, rows = [], [], [] for r in gt: ut = K.user_turns(convs[r["id"]]) for row in r["input_quality"]: i = int(row["turn"][1:]) - 1 goal_prompts.append(goal_prompt(ut[i])); spec_prompts.append(spec_prompt(ut[i])) rows.append((int("goal_stated" in row["features"]), int("specific_specification" in row["features"]))) gres = client.run_all(goal_prompts); sres = client.run_all(spec_prompts) gyt = [r[0] for r in rows]; gyp = [int(bool((K.OA.parse(gres[p], ("goal_stated",)) or {}).get("goal_stated"))) for p in goal_prompts] syt = [r[1] for r in rows]; syp = [int(bool((K.OA.parse(sres[p], ("specific_specification",)) or {}).get("specific_specification"))) for p in spec_prompts] gk = K.cohen_kappa(gyt, gyp); sk = K.cohen_kappa(syt, syp) print("=== C1 per-feature IQ (dedicated single-feature prompts) ===") print(f" goal_stated κ={gk:+.3f} [TN/FP/FN/TP {K.binary_counts(gyt,gyp)}] (A+B best v2: +0.226)") print(f" specific_specification κ={sk:+.3f} [TN/FP/FN/TP {K.binary_counts(syt,syp)}] (A+B best v2: +0.568)") print(f" IQ headline (mean) κ={(gk+sk)/2:+.3f} (A+B best: +0.397)") # ---- C2: self-consistency interaction ---- sc = SamplingClient(base, token) iyt, iyp = [], [] items = [] for r in gt: ut = K.user_turns(convs[r["id"]]) for row in r["interaction"]: i = int(row["turn"][1:]) - 1 items.append((int(bool(row["refinement"])), int_v3(ut[i - 1], ut[i]))) print(f"\n=== C2 self-consistency interaction (3×, temp 0.6, majority) over {len(items)} turns ===", flush=True) import concurrent.futures as cf cache = {} def vote(prompt): outs = sc.sample(prompt, n=3, temperature=0.6) yes = 0 for o in outs: d = K.OA.parse(o, ("refinement_attempt",)) or {} yes += int(bool(d.get("refinement_attempt"))) return int(yes >= 2) prompts = [p for _, p in items] with cf.ThreadPoolExecutor(max_workers=8) as ex: votes = list(ex.map(vote, prompts)) iyt = [t for t, _ in items]; iyp = votes ik = K.cohen_kappa(iyt, iyp) print(f" interaction (self-consistency) κ={ik:+.3f} [TN/FP/FN/TP {K.binary_counts(iyt,iyp)}] (A+B best v3: +0.320)") json.dump({"goal": gk, "spec": sk, "iq_mean": (gk + sk) / 2, "interaction_sc": ik}, open(os.path.join(os.path.dirname(__file__), "_cache", "step_c.json"), "w"), indent=1) if __name__ == "__main__": main()