Spaces:

build-small-hackathon
/

promptstat

Runtime error

App Files Files Community

promptstat / eval /step_c.py

xxixx1028

Deploy PromptStat — UI shell + MiniCPM4.1-8B + 4-LoRA hybrid (Modal)

dc9f530 verified 18 days ago

Raw

History Blame Contribute Delete

6.57 kB

	"""STEP C — per-feature dedicated prompts + self-consistency. Compares to the A+B best.

	C1: Input Quality as TWO single-feature binary prompts (goal_stated solo, specific_specification solo),
	removing any trade-off the combined prompt forced. Deterministic, cached.
	C2: Self-consistency for Interaction — sample the v3 prompt 3× at temperature 0.6, majority vote
	(variance reduction on the noisy axis). Uses the vLLM `n` param (one request → 3 completions).

	Run: python -m eval.step_c
	"""
	from __future__ import annotations

	import json
	import os
	import sys

	import requests

	from eval import kappa as K
	from eval.prompts_v2 import _ask
	from eval.prompts_v3 import build_interaction_prompt as int_v3

	# Answer templates passed as the second argument to `_ask` by goal_prompt / spec_prompt.
	# `<bool>` is a literal placeholder in the text, so these strings are not valid JSON themselves.
	# NOTE(review): presumably `_ask` shows this shape to the model as the expected reply — confirm
	# in eval.prompts_v2.
	_GOAL_T = '{"goal_stated": <bool>}'
	_SPEC_T = '{"specific_specification": <bool>}'


	def goal_prompt(msg):
	    """Build the dedicated single-feature prompt for `goal_stated`.

	    The instruction asks whether `msg` states a purpose/use/audience (a "why")
	    rather than only the task, followed by TRUE/FALSE examples. The text is
	    handed to `_ask` with the `_GOAL_T` answer template and the label
	    "User message"; the return value is whatever `_ask` returns.
	    """
	    question = (
	        "Does THIS user message state a PURPOSE / use / audience / reason (a WHY) for the request — not just "
	        "the task itself? TRUE only if a why/use/audience is present (stated or clearly implied).\n"
	    )
	    examples = (
	        " ▸ TRUE: 'summarize for my class', 'I'm building a budgeting app, write the schema', 'dedupe my data "
	        "for analysis'. ✗ FALSE (task only): 'write a sorting function', 'fix this code', 'list 10 names'."
	    )
	    return _ask(question + examples, _GOAL_T, "User message", msg)


	# Answer template passed as the second argument to `_ask` by decomp_prompt.
	_DECOMP_T = '{"decomposition": <bool>}'


	def decomp_prompt(msg):
	    """Build the dedicated single-feature prompt for `decomposition`.

	    The instruction asks whether `msg` splits the request into several ordered
	    or numbered sub-tasks, with TRUE/FALSE examples. The text is handed to
	    `_ask` with the `_DECOMP_T` answer template and the label "User message";
	    the return value is whatever `_ask` returns.
	    """
	    pieces = (
	        "Does THIS user message break the REQUEST into multiple ordered/numbered SUB-TASKS the AI must ",
	        "perform? TRUE for 'first X, then Y, then Z' or '1) … 2) … 3) …' of distinct sub-tasks.\n",
	        " ▸ TRUE: 'First summarize it, then list 3 risks, then propose fixes', '1) clean the data 2) plot it'.\n",
	        " ✗ FALSE: a single ask ('write a sorting function'); 'write a step-by-step guide' (output, not a ",
	        "decomposed request); 'think step by step' (that's reasoning); a numbered list of data/options.",
	    )
	    instruction = "".join(pieces)
	    return _ask(instruction, _DECOMP_T, "User message", msg)


	def spec_prompt(msg):
	    """Build the dedicated single-feature prompt for `specific_specification`.

	    The instruction asks, liberally, whether `msg` contains any concrete detail
	    (number, version, named tool, data, constraint, or the user's background),
	    with TRUE/FALSE examples. The text is handed to `_ask` with the `_SPEC_T`
	    answer template and the label "User message"; the return value is whatever
	    `_ask` returns.
	    """
	    question = (
	        "Does THIS user message contain ANY concrete detail of ANY kind — a number, version, named "
	        "tool/library/framework, file/data/schema, an explicit constraint, OR the user's OWN "
	        "background/role/skill level? Be LIBERAL: a single concrete detail is enough.\n"
	    )
	    examples = (
	        " ▸ TRUE: 'Python 3.11', 'I'm a beginner', 'under 100ms', 'here's my CSV', 'using pandas', "
	        "'a 1000-word post'. ✗ FALSE only when fully vague: 'make it fast', 'help me', 'write something good'."
	    )
	    return _ask(question + examples, _SPEC_T, "User message", msg)


	class SamplingClient:
	    """vLLM OpenAI-style endpoint with n completions per request (for self-consistency).

	    Args:
	        base_url: Endpoint root; a trailing "/" is stripped before "/v1/chat/completions"
	            is appended.
	        token: Bearer token sent in the Authorization header.
	        timeout: Per-request timeout passed to `requests.post`.
	        model: Value of the "model" field in the request body.
	        max_tokens: Value of the "max_tokens" field in the request body.
	    """

	    def __init__(self, base_url, token, timeout=90, model="MiniCPM4.1-8B", max_tokens=256):
	        self.base = base_url.rstrip("/")
	        self.token = token
	        self.timeout = timeout
	        self.model = model
	        self.max_tokens = max_tokens

	    def sample(self, prompt, n=3, temperature=0.6):
	        """Request `n` completions for a single-turn `prompt` and return their cleaned texts.

	        Returns:
	            One `clean_content(...)` result per entry in the response's "choices" list.

	        Raises:
	            requests.HTTPError: via `raise_for_status()` on a non-success response.
	        """
	        # Lazy project import, resolved BEFORE the request so an import failure
	        # does not waste a model call.
	        from prompt_card.llm.minicpm import clean_content

	        body = {
	            "model": self.model,
	            "messages": [{"role": "user", "content": prompt}],
	            "temperature": temperature,
	            "n": n,
	            "max_tokens": self.max_tokens,
	            "chat_template_kwargs": {"enable_thinking": False},
	        }
	        headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
	        r = requests.post(
	            f"{self.base}/v1/chat/completions",
	            headers=headers,
	            json=body,
	            timeout=self.timeout,
	        )
	        r.raise_for_status()
	        return [clean_content(c["message"]["content"]) for c in r.json()["choices"]]


	def _parsed_flag(raw, key):
	    """Return 1 when the parsed model output carries a truthy `key`, else 0.

	    A falsy result from `K.OA.parse` is treated as an empty dict, so it scores 0
	    instead of raising.
	    """
	    parsed = K.OA.parse(raw, (key,)) or {}
	    return int(bool(parsed.get(key)))


	def main():
	    """Run STEP C: per-feature IQ prompts (C1) and self-consistency interaction (C2).

	    Reads the endpoint from OPENBMB_BASE_URL / OPENBMB_TOKEN (exits with status 2 when
	    either is missing), prints Cohen's kappa per axis next to the hard-coded A+B
	    baselines, and writes the kappas to `_cache/step_c.json` beside this file.
	    """
	    base = os.environ.get("OPENBMB_BASE_URL")
	    token = os.environ.get("OPENBMB_TOKEN")
	    if not base or not token:
	        print("ERROR: set OPENBMB_BASE_URL/OPENBMB_TOKEN", file=sys.stderr)
	        sys.exit(2)
	    from prompt_card.llm.minicpm import MiniCPMClient

	    gt = K.load_gt()
	    convs = K.load_convs()
	    client = K.CachedClient(MiniCPMClient(base, token), workers=8)

	    # ---- C1: per-feature IQ ----
	    goal_prompts, spec_prompts, truth = [], [], []
	    for rec in gt:
	        turns = K.user_turns(convs[rec["id"]])
	        for row in rec["input_quality"]:
	            # Drop the first character of the turn label; the rest is a 1-based
	            # number, converted here to a 0-based index into the user turns.
	            idx = int(row["turn"][1:]) - 1
	            goal_prompts.append(goal_prompt(turns[idx]))
	            spec_prompts.append(spec_prompt(turns[idx]))
	            feats = row["features"]
	            truth.append((int("goal_stated" in feats), int("specific_specification" in feats)))
	    gres = client.run_all(goal_prompts)
	    sres = client.run_all(spec_prompts)
	    # run_all results are looked up by the prompt itself.
	    gyt = [g for g, _ in truth]
	    gyp = [_parsed_flag(gres[p], "goal_stated") for p in goal_prompts]
	    syt = [s for _, s in truth]
	    syp = [_parsed_flag(sres[p], "specific_specification") for p in spec_prompts]
	    gk = K.cohen_kappa(gyt, gyp)
	    sk = K.cohen_kappa(syt, syp)
	    print("=== C1 per-feature IQ (dedicated single-feature prompts) ===")
	    print(f" goal_stated κ={gk:+.3f} [TN/FP/FN/TP {K.binary_counts(gyt,gyp)}] (A+B best v2: +0.226)")
	    print(f" specific_specification κ={sk:+.3f} [TN/FP/FN/TP {K.binary_counts(syt,syp)}] (A+B best v2: +0.568)")
	    print(f" IQ headline (mean) κ={(gk+sk)/2:+.3f} (A+B best: +0.397)")

	    # ---- C2: self-consistency interaction ----
	    sc = SamplingClient(base, token)
	    items = []
	    for rec in gt:
	        turns = K.user_turns(convs[rec["id"]])
	        for row in rec["interaction"]:
	            idx = int(row["turn"][1:]) - 1
	            # NOTE(review): idx == 0 would make turns[idx - 1] wrap around to the LAST
	            # turn; presumably interaction rows never reference the first turn — confirm
	            # against the ground-truth data.
	            items.append((int(bool(row["refinement"])), int_v3(turns[idx - 1], turns[idx])))
	    print(f"\n=== C2 self-consistency interaction (3×, temp 0.6, majority) over {len(items)} turns ===", flush=True)
	    import concurrent.futures as cf

	    def vote(prompt):
	        # Majority vote over 3 samples: at least 2 must say refinement_attempt.
	        outs = sc.sample(prompt, n=3, temperature=0.6)
	        yes = sum(_parsed_flag(o, "refinement_attempt") for o in outs)
	        return int(yes >= 2)

	    with cf.ThreadPoolExecutor(max_workers=8) as ex:
	        # ex.map preserves input order, so votes line up with `items`.
	        iyp = list(ex.map(vote, [p for _, p in items]))
	    iyt = [t for t, _ in items]
	    ik = K.cohen_kappa(iyt, iyp)
	    print(f" interaction (self-consistency) κ={ik:+.3f} [TN/FP/FN/TP {K.binary_counts(iyt,iyp)}] (A+B best v3: +0.320)")

	    # Create the output directory if needed and close the file deterministically, so a
	    # missing `_cache` cannot discard the results after every model call has been made.
	    out_dir = os.path.join(os.path.dirname(__file__), "_cache")
	    os.makedirs(out_dir, exist_ok=True)
	    with open(os.path.join(out_dir, "step_c.json"), "w", encoding="utf-8") as fh:
	        json.dump({"goal": gk, "spec": sk, "iq_mean": (gk + sk) / 2, "interaction_sc": ik}, fh, indent=1)


	# Script entry point (`python -m eval.step_c`); importing the module does not run the evaluation.
	if __name__ == "__main__":
	    main()