Spaces:
Runtime error
Runtime error
| """STEP C — per-feature dedicated prompts + self-consistency. Compares to the A+B best. | |
| C1: Input Quality as TWO single-feature binary prompts (goal_stated solo, specific_specification solo), | |
| removing any trade-off the combined prompt forced. Deterministic, cached. | |
| C2: Self-consistency for Interaction — sample the v3 prompt 3× at temperature 0.6, majority vote | |
| (variance reduction on the noisy axis). Uses the vLLM `n` param (one request → 3 completions). | |
| Run: python -m eval.step_c | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import requests | |
| from eval import kappa as K | |
| from eval.prompts_v2 import _ask | |
| from eval.prompts_v3 import build_interaction_prompt as int_v3 | |
# JSON answer templates handed to `_ask` by the single-feature Input Quality prompts.
_GOAL_T = '{"goal_stated": <bool>}'
_SPEC_T = '{"specific_specification": <bool>}'


def goal_prompt(msg):
    """Build the dedicated single-feature prompt for ``goal_stated``.

    The instruction asks only whether the user message carries a purpose /
    use / audience / reason, and gives TRUE and FALSE examples. The text is
    forwarded to ``_ask`` together with the ``_GOAL_T`` answer template, the
    label "User message" and ``msg``.

    Returns whatever ``_ask`` returns (presumably the finished prompt string;
    confirm against ``eval.prompts_v2``).
    """
    question = (
        "Does THIS user message state a PURPOSE / use / audience / reason (a WHY) "
        "for the request — not just the task itself? "
        "TRUE only if a why/use/audience is present (stated or clearly implied)."
    )
    examples = (
        " ▸ TRUE: 'summarize for my class', "
        "'I'm building a budgeting app, write the schema', "
        "'dedupe my data for analysis'. "
        "✗ FALSE (task only): 'write a sorting function', 'fix this code', 'list 10 names'."
    )
    # The instruction is two lines: the question, then the worked examples.
    return _ask("\n".join((question, examples)), _GOAL_T, "User message", msg)
# JSON answer template handed to `_ask` by the decomposition prompt.
_DECOMP_T = '{"decomposition": <bool>}'


def decomp_prompt(msg):
    """Build the dedicated single-feature prompt for ``decomposition``.

    The instruction asks whether the user message splits the request into
    several ordered/numbered sub-tasks, and lists TRUE and FALSE examples.
    The text is forwarded to ``_ask`` together with the ``_DECOMP_T`` answer
    template, the label "User message" and ``msg``.

    Returns whatever ``_ask`` returns (presumably the finished prompt string;
    confirm against ``eval.prompts_v2``).
    """
    question = (
        "Does THIS user message break the REQUEST into multiple ordered/numbered "
        "SUB-TASKS the AI must perform? "
        "TRUE for 'first X, then Y, then Z' or '1) … 2) … 3) …' of distinct sub-tasks."
    )
    positives = (
        " ▸ TRUE: 'First summarize it, then list 3 risks, then propose fixes', "
        "'1) clean the data 2) plot it'."
    )
    negatives = (
        " ✗ FALSE: a single ask ('write a sorting function'); "
        "'write a step-by-step guide' (output, not a decomposed request); "
        "'think step by step' (that's reasoning); a numbered list of data/options."
    )
    # The instruction is three lines: question, TRUE examples, FALSE examples.
    instruction = "\n".join((question, positives, negatives))
    return _ask(instruction, _DECOMP_T, "User message", msg)
def spec_prompt(msg):
    """Build the dedicated single-feature prompt for ``specific_specification``.

    The instruction asks whether the user message contains any concrete
    detail at all (deliberately liberal), and lists TRUE and FALSE examples.
    The text is forwarded to ``_ask`` together with the ``_SPEC_T`` answer
    template, the label "User message" and ``msg``.

    Returns whatever ``_ask`` returns (presumably the finished prompt string;
    confirm against ``eval.prompts_v2``).
    """
    question = (
        "Does THIS user message contain ANY concrete detail of ANY kind — "
        "a number, version, named tool/library/framework, file/data/schema, "
        "an explicit constraint, OR the user's OWN background/role/skill level? "
        "Be LIBERAL: a single concrete detail is enough."
    )
    examples = (
        " ▸ TRUE: 'Python 3.11', 'I'm a beginner', 'under 100ms', 'here's my CSV', "
        "'using pandas', 'a 1000-word post'. "
        "✗ FALSE only when fully vague: 'make it fast', 'help me', 'write something good'."
    )
    # The instruction is two lines: the question, then the worked examples.
    return _ask("\n".join((question, examples)), _SPEC_T, "User message", msg)
class SamplingClient:
    """vLLM OpenAI-style endpoint with n completions per request (for self-consistency).

    One POST to ``{base}/v1/chat/completions`` asks for ``n`` completions of
    the same single-user-message chat, so a majority vote needs only one
    round trip.

    Args:
        base_url: Endpoint root; a trailing "/" is stripped.
        token: Bearer token sent in the ``Authorization`` header.
        timeout: Per-request timeout in seconds, passed to ``requests.post``.
        model: Model name placed in the request body.
        max_tokens: Completion length cap placed in the request body.
    """

    def __init__(self, base_url: str, token: str, timeout: float = 90, *,
                 model: str = "MiniCPM4.1-8B", max_tokens: int = 256) -> None:
        self.base = base_url.rstrip("/")
        self.token = token
        self.timeout = timeout
        self.model = model
        self.max_tokens = max_tokens

    def sample(self, prompt: str, n: int = 3, temperature: float = 0.6) -> list:
        """Request ``n`` completions of ``prompt`` and return their cleaned contents.

        Args:
            prompt: Sent as the content of a single ``user`` message.
            n: Number of completions requested in the one call.
            temperature: Sampling temperature for the request.

        Returns:
            One entry per element of the response's ``choices`` list: the
            message content after passing through ``clean_content``.

        Raises:
            requests.HTTPError: Via ``raise_for_status`` on a non-success reply.
        """
        # Imported lazily (as in the rest of this module) and before the HTTP
        # call, so a missing package fails before a sampling request is spent.
        from prompt_card.llm.minicpm import clean_content

        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "n": n,
            "max_tokens": self.max_tokens,
            # Sent as-is to the server's chat template: thinking mode off.
            "chat_template_kwargs": {"enable_thinking": False},
        }
        headers = {
            "Authorization": f"Bearer {self.token}",
            "Content-Type": "application/json",
        }
        r = requests.post(
            f"{self.base}/v1/chat/completions",
            headers=headers,
            json=body,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return [clean_content(c["message"]["content"]) for c in r.json()["choices"]]
def _turn_index(row):
    """Convert a ground-truth turn label (one prefix character + 1-based number) to a 0-based index."""
    return int(row["turn"][1:]) - 1


def _parsed_flag(raw, key):
    """Parse one model output with ``K.OA.parse`` and return ``key`` as 0/1 (0 when parsing yields nothing)."""
    return int(bool((K.OA.parse(raw, (key,)) or {}).get(key)))


def main() -> None:
    """Run STEP C: per-feature IQ prompts (C1) and self-consistency interaction (C2).

    Reads ``OPENBMB_BASE_URL`` / ``OPENBMB_TOKEN`` from the environment and
    exits with status 2 when either is missing. Prints Cohen's kappa and the
    binary counts for each axis, then writes the four headline numbers to
    ``_cache/step_c.json`` next to this file.
    """
    base = os.environ.get("OPENBMB_BASE_URL")
    token = os.environ.get("OPENBMB_TOKEN")
    if not base or not token:
        print("ERROR: set OPENBMB_BASE_URL/OPENBMB_TOKEN", file=sys.stderr)
        sys.exit(2)

    # Project and helper imports stay local, after the environment check.
    import concurrent.futures as cf
    from prompt_card.llm.minicpm import MiniCPMClient

    gt = K.load_gt()
    convs = K.load_convs()
    client = K.CachedClient(MiniCPMClient(base, token), workers=8)

    # ---- C1: per-feature IQ ----
    goal_prompts, spec_prompts, rows = [], [], []
    for r in gt:
        ut = K.user_turns(convs[r["id"]])
        for row in r["input_quality"]:
            i = _turn_index(row)
            goal_prompts.append(goal_prompt(ut[i]))
            spec_prompts.append(spec_prompt(ut[i]))
            rows.append((
                int("goal_stated" in row["features"]),
                int("specific_specification" in row["features"]),
            ))

    # run_all results are looked up by the prompt itself.
    gres = client.run_all(goal_prompts)
    sres = client.run_all(spec_prompts)
    gyt = [truth[0] for truth in rows]
    gyp = [_parsed_flag(gres[p], "goal_stated") for p in goal_prompts]
    syt = [truth[1] for truth in rows]
    syp = [_parsed_flag(sres[p], "specific_specification") for p in spec_prompts]
    gk = K.cohen_kappa(gyt, gyp)
    sk = K.cohen_kappa(syt, syp)
    print("=== C1 per-feature IQ (dedicated single-feature prompts) ===")
    print(f" goal_stated κ={gk:+.3f} [TN/FP/FN/TP {K.binary_counts(gyt,gyp)}] (A+B best v2: +0.226)")
    print(f" specific_specification κ={sk:+.3f} [TN/FP/FN/TP {K.binary_counts(syt,syp)}] (A+B best v2: +0.568)")
    print(f" IQ headline (mean) κ={(gk+sk)/2:+.3f} (A+B best: +0.397)")

    # ---- C2: self-consistency interaction ----
    sc = SamplingClient(base, token)
    items = []
    for r in gt:
        ut = K.user_turns(convs[r["id"]])
        for row in r["interaction"]:
            i = _turn_index(row)
            # NOTE(review): if i == 0, ut[i - 1] silently wraps to the LAST user
            # turn. Presumably interaction rows start at the second turn — confirm.
            items.append((int(bool(row["refinement"])), int_v3(ut[i - 1], ut[i])))
    print(f"\n=== C2 self-consistency interaction (3×, temp 0.6, majority) over {len(items)} turns ===", flush=True)

    def vote(prompt):
        """Sample the prompt 3 times and return 1 when at least 2 samples say refinement."""
        outs = sc.sample(prompt, n=3, temperature=0.6)
        yes = sum(_parsed_flag(o, "refinement_attempt") for o in outs)
        return int(yes >= 2)

    prompts = [p for _, p in items]
    # ex.map keeps input order, so votes line up with items.
    with cf.ThreadPoolExecutor(max_workers=8) as ex:
        votes = list(ex.map(vote, prompts))
    iyt = [t for t, _ in items]
    iyp = votes
    ik = K.cohen_kappa(iyt, iyp)
    print(f" interaction (self-consistency) κ={ik:+.3f} [TN/FP/FN/TP {K.binary_counts(iyt,iyp)}] (A+B best v3: +0.320)")

    # Persist the headline numbers; `with` closes the file deterministically.
    out_dir = os.path.join(os.path.dirname(__file__), "_cache")
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "step_c.json"), "w") as fh:
        json.dump({"goal": gk, "spec": sk, "iq_mean": (gk + sk) / 2, "interaction_sc": ik},
                  fh, indent=1)


if __name__ == "__main__":
    main()