# promptstat/eval/step_c.py
# Uploaded by xxixx1028
# Commit dc9f530 (verified): Deploy PromptStat — UI shell + MiniCPM4.1-8B + 4-LoRA hybrid (Modal)
# Size: 6.57 kB
# (Web file-viewer button labels "Raw / History / Blame / Contribute / Delete" omitted.)
"""STEP C — per-feature dedicated prompts + self-consistency. Compares to the A+B best.
C1: Input Quality as TWO single-feature binary prompts (goal_stated solo, specific_specification solo),
removing any trade-off the combined prompt forced. Deterministic, cached.
C2: Self-consistency for Interaction — sample the v3 prompt 3× at temperature 0.6, majority vote
(variance reduction on the noisy axis). Uses the vLLM `n` param (one request → 3 completions).
Run: python -m eval.step_c
"""
from __future__ import annotations
import json
import os
import sys
import requests
from eval import kappa as K
from eval.prompts_v2 import _ask
from eval.prompts_v3 import build_interaction_prompt as int_v3
# JSON answer templates handed to `_ask` as its second argument by the
# single-feature prompt builders below. The key names are the same ones
# `main()` asks `K.OA.parse` to extract from the model replies.
_GOAL_T = '{"goal_stated": <bool>}'  # used by goal_prompt
_SPEC_T = '{"specific_specification": <bool>}'  # used by spec_prompt
def goal_prompt(msg):
    """Build the dedicated `goal_stated` prompt for a single user message.

    The instruction asks one binary question — does the message give a
    purpose / use / audience / reason — followed by one line of TRUE and
    FALSE examples. The instruction, the `_GOAL_T` answer template, the
    label "User message" and `msg` are forwarded to `_ask`, whose return
    value is passed through unchanged.
    """
    # Line 1 of the instruction: the question and the TRUE criterion.
    question = (
        "Does THIS user message state a PURPOSE / use / audience / reason (a WHY) for the request — not just "
        "the task itself? TRUE only if a why/use/audience is present (stated or clearly implied)."
    )
    # Line 2 of the instruction: positive and negative examples.
    examples = (
        " ▸ TRUE: 'summarize for my class', 'I'm building a budgeting app, write the schema', 'dedupe my data "
        "for analysis'. ✗ FALSE (task only): 'write a sorting function', 'fix this code', 'list 10 names'."
    )
    instruction = "\n".join((question, examples))
    return _ask(instruction, _GOAL_T, "User message", msg)
# JSON answer template for decomp_prompt (handed to `_ask` as its second argument).
_DECOMP_T = '{"decomposition": <bool>}'
def decomp_prompt(msg):
    """Build the dedicated `decomposition` prompt for a single user message.

    The instruction asks whether the message splits the request into
    several ordered / numbered sub-tasks, then lists TRUE examples and
    FALSE examples on separate lines. The instruction, the `_DECOMP_T`
    answer template, the label "User message" and `msg` are forwarded to
    `_ask`, whose return value is passed through unchanged.
    """
    # Line 1: the question and what counts as TRUE.
    question = (
        "Does THIS user message break the REQUEST into multiple ordered/numbered SUB-TASKS the AI must "
        "perform? TRUE for 'first X, then Y, then Z' or '1) … 2) … 3) …' of distinct sub-tasks."
    )
    # Line 2: positive examples.
    positives = (
        " ▸ TRUE: 'First summarize it, then list 3 risks, then propose fixes', '1) clean the data 2) plot it'."
    )
    # Line 3: look-alikes that must be answered FALSE.
    negatives = (
        " ✗ FALSE: a single ask ('write a sorting function'); 'write a step-by-step guide' (output, not a "
        "decomposed request); 'think step by step' (that's reasoning); a numbered list of data/options."
    )
    instruction = "\n".join((question, positives, negatives))
    return _ask(instruction, _DECOMP_T, "User message", msg)
def spec_prompt(msg):
    """Build the dedicated `specific_specification` prompt for a single user message.

    The instruction asks whether the message contains any concrete detail
    at all (deliberately liberal: one detail is enough), followed by one
    line of TRUE and FALSE examples. The instruction, the `_SPEC_T` answer
    template, the label "User message" and `msg` are forwarded to `_ask`,
    whose return value is passed through unchanged.
    """
    # Line 1 of the instruction: the question and the liberal TRUE criterion.
    question = (
        "Does THIS user message contain ANY concrete detail of ANY kind — a number, version, named "
        "tool/library/framework, file/data/schema, an explicit constraint, OR the user's OWN "
        "background/role/skill level? Be LIBERAL: a single concrete detail is enough."
    )
    # Line 2 of the instruction: positive and negative examples.
    examples = (
        " ▸ TRUE: 'Python 3.11', 'I'm a beginner', 'under 100ms', 'here's my CSV', 'using pandas', "
        "'a 1000-word post'. ✗ FALSE only when fully vague: 'make it fast', 'help me', 'write something good'."
    )
    instruction = "\n".join((question, examples))
    return _ask(instruction, _SPEC_T, "User message", msg)
class SamplingClient:
    """vLLM OpenAI-style endpoint with n completions per request (for self-consistency).

    Each call to :meth:`sample` issues a single POST to
    ``{base_url}/v1/chat/completions`` and returns every completion in the
    response, so one request yields all the votes for one prompt.
    """

    def __init__(self, base_url, token, timeout=90, *, model="MiniCPM4.1-8B", max_tokens=256):
        """Store the connection settings; no network traffic happens here.

        Args:
            base_url: Server root URL; trailing slashes are stripped.
            token: Bearer token sent in the ``Authorization`` header.
            timeout: Timeout in seconds handed to ``requests.post``.
            model: Value of the ``model`` field in the request body.
            max_tokens: Value of the ``max_tokens`` field in the request body.
        """
        self.base = base_url.rstrip("/")
        self.token = token
        self.timeout = timeout
        self.model = model
        self.max_tokens = max_tokens

    def sample(self, prompt, n=3, temperature=0.6):
        """Request ``n`` completions for ``prompt`` and return their cleaned texts.

        Args:
            prompt: Sent as the content of a single ``user`` message.
            n: Number of completions requested in the one call.
            temperature: Sampling temperature for the request.

        Returns:
            A list with one entry per element of the response's ``choices``,
            each the message content passed through ``clean_content``.

        Raises:
            requests.HTTPError: via ``raise_for_status`` on a non-2xx reply.
        """
        # Lazy project import, resolved before the HTTP call so that an
        # import problem does not cost a model request first.
        from prompt_card.llm.minicpm import clean_content

        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "n": n,
            "max_tokens": self.max_tokens,
            "chat_template_kwargs": {"enable_thinking": False},
        }
        r = requests.post(
            f"{self.base}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"},
            json=body,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return [clean_content(c["message"]["content"]) for c in r.json()["choices"]]
def main():
    """Run STEP C (C1 per-feature IQ prompts, C2 self-consistency) and save the kappas.

    Reads the endpoint from ``OPENBMB_BASE_URL`` / ``OPENBMB_TOKEN`` and exits
    with status 2 if either is unset. Prints Cohen's kappa and the binary
    counts for each experiment, then writes the four kappa values
    (``goal``, ``spec``, ``iq_mean``, ``interaction_sc``) to
    ``_cache/step_c.json`` next to this file.
    """
    base = os.environ.get("OPENBMB_BASE_URL")
    token = os.environ.get("OPENBMB_TOKEN")
    if not base or not token:
        print("ERROR: set OPENBMB_BASE_URL/OPENBMB_TOKEN", file=sys.stderr)
        sys.exit(2)

    # Deferred imports: only needed once the endpoint configuration is present.
    import concurrent.futures as cf
    from prompt_card.llm.minicpm import MiniCPMClient

    def flag(raw, key):
        """Parse one model reply and return field `key` as 0/1.

        Yields 0 when the parse result is falsy or the key is missing/falsy.
        """
        return int(bool((K.OA.parse(raw, (key,)) or {}).get(key)))

    gt = K.load_gt()
    convs = K.load_convs()
    client = K.CachedClient(MiniCPMClient(base, token), workers=8)

    # ---- C1: per-feature IQ ----
    goal_prompts, spec_prompts, rows = [], [], []
    for r in gt:
        ut = K.user_turns(convs[r["id"]])
        for row in r["input_quality"]:
            # The turn label's first character is dropped and the rest is read
            # as a 1-based turn number (presumably labels look like "T3").
            i = int(row["turn"][1:]) - 1
            goal_prompts.append(goal_prompt(ut[i]))
            spec_prompts.append(spec_prompt(ut[i]))
            rows.append((
                int("goal_stated" in row["features"]),
                int("specific_specification" in row["features"]),
            ))

    # run_all results are looked up by prompt, so predictions are rebuilt in
    # prompt order to stay aligned with the ground-truth rows.
    gres = client.run_all(goal_prompts)
    sres = client.run_all(spec_prompts)
    gyt = [truth[0] for truth in rows]
    gyp = [flag(gres[p], "goal_stated") for p in goal_prompts]
    syt = [truth[1] for truth in rows]
    syp = [flag(sres[p], "specific_specification") for p in spec_prompts]
    gk = K.cohen_kappa(gyt, gyp)
    sk = K.cohen_kappa(syt, syp)
    print("=== C1 per-feature IQ (dedicated single-feature prompts) ===")
    print(f" goal_stated κ={gk:+.3f} [TN/FP/FN/TP {K.binary_counts(gyt,gyp)}] (A+B best v2: +0.226)")
    print(f" specific_specification κ={sk:+.3f} [TN/FP/FN/TP {K.binary_counts(syt,syp)}] (A+B best v2: +0.568)")
    print(f" IQ headline (mean) κ={(gk+sk)/2:+.3f} (A+B best: +0.397)")

    # ---- C2: self-consistency interaction ----
    sc = SamplingClient(base, token)
    items = []
    for r in gt:
        ut = K.user_turns(convs[r["id"]])
        for row in r["interaction"]:
            i = int(row["turn"][1:]) - 1
            # NOTE(review): for i == 0, ut[i - 1] is ut[-1] (the LAST user turn).
            # This presumes interaction rows never reference the first turn —
            # confirm against the ground-truth data.
            items.append((int(bool(row["refinement"])), int_v3(ut[i - 1], ut[i])))
    print(f"\n=== C2 self-consistency interaction (3×, temp 0.6, majority) over {len(items)} turns ===", flush=True)

    def vote(prompt):
        """Sample the prompt 3× and return the majority `refinement_attempt` as 0/1."""
        outs = sc.sample(prompt, n=3, temperature=0.6)
        yes = sum(flag(o, "refinement_attempt") for o in outs)
        return int(yes >= 2)

    prompts = [p for _, p in items]
    # Executor.map preserves input order, keeping votes aligned with `items`.
    with cf.ThreadPoolExecutor(max_workers=8) as ex:
        votes = list(ex.map(vote, prompts))
    iyt = [t for t, _ in items]
    iyp = votes
    ik = K.cohen_kappa(iyt, iyp)
    print(f" interaction (self-consistency) κ={ik:+.3f} [TN/FP/FN/TP {K.binary_counts(iyt,iyp)}] (A+B best v3: +0.320)")

    # Persist the summary. Create the directory if needed so a missing
    # `_cache` cannot discard the results after all model calls have run,
    # and close the file deterministically.
    out_dir = os.path.join(os.path.dirname(__file__), "_cache")
    os.makedirs(out_dir, exist_ok=True)
    summary = {"goal": gk, "spec": sk, "iq_mean": (gk + sk) / 2, "interaction_sc": ik}
    with open(os.path.join(out_dir, "step_c.json"), "w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=1)
# Script entry point; the module docstring gives the invocation
# (`python -m eval.step_c`).
if __name__ == "__main__":
    main()