promptstat / eval /_ceiling /prep_p7.py
xxixx1028's picture
Deploy PromptStat — UI shell + MiniCPM4.1-8B + 4-LoRA hybrid (Modal)
dc9f530 verified
Raw
History Blame Contribute Delete
2.93 kB
"""Phase-7 ceiling-check prep for Interaction + Critical (goal/decomp reuse Phase-6 artifacts).
Builds 50-unit stratified samples (positive-enriched so inter-model κ is stable) from the 159-conv
validation set, with Kim GT attached, plus a BLIND version (context only) for Opus/Sonnet labelers.
Interaction unit: (prev_user, this_user) -> refinement_attempt bool.
Critical unit: (prev_assistant, this_user) -> set of CE types (5).
Run: python -m eval._ceiling.prep_p7
"""
from __future__ import annotations
import json
import os
import random
from eval import kappa as K
from prompt_card.scoring import observable_axes as OA
from eval.step_critical import CE
# Directory containing this module; _write() places its JSON outputs here.
HERE = os.path.dirname(__file__)
# Seed for the random.Random instance created in main(), so sampling is reproducible.
SEED = 7
# Target number of units per sample, passed to _stratified() from main().
N = 50
def _interaction_units(gt, convs):
    """Collect Interaction units: (previous user turn, this user turn) pairs.

    Args:
        gt: iterable of ground-truth records; each is read through ``r["id"]``
            and ``r["interaction"]`` (rows carrying ``"turn"`` and
            ``"refinement"``).
        convs: mapping from conversation id to the conversation object that is
            handed to ``K.user_turns``.

    Returns:
        A list of dicts with keys ``cid``, ``turn``, ``prev_user``,
        ``this_user`` and ``kim`` (the ground-truth refinement flag as a bool).
        Rows whose turn has no preceding user turn, or whose index lies past
        the end of the user turns, are skipped.
    """
    collected = []
    for record in gt:
        user_turns = K.user_turns(convs[record["id"]])
        for ann in record["interaction"]:
            # The turn label's first character is dropped and the rest parsed
            # as an integer; minus one gives the index into the user turns.
            pos = int(ann["turn"][1:]) - 1
            # A previous user turn must exist, so index 0 is excluded as well.
            if 1 <= pos < len(user_turns):
                collected.append(
                    {
                        "cid": record["id"],
                        "turn": ann["turn"],
                        "prev_user": user_turns[pos - 1],
                        "this_user": user_turns[pos],
                        "kim": bool(ann["refinement"]),
                    }
                )
    return collected
def _critical_units(gt, convs):
    """Collect Critical units: (previous assistant text, this user turn) pairs.

    Args:
        gt: iterable of ground-truth records; each is read through ``r["id"]``
            and ``r["critical"]`` (rows carrying ``"turn"`` and ``"types"``).
        convs: mapping from conversation id to the conversation object.

    Returns:
        A list of dicts with keys ``cid``, ``turn``, ``prev_assistant``,
        ``this_user`` and ``kim`` (the sorted ground-truth types for the turn).
        Rows whose turn index falls outside the user turns are skipped.
    """
    collected = []
    for record in gt:
        conversation = convs[record["id"]]
        user_turns = K.user_turns(conversation)
        for ann in record["critical"]:
            # The turn label's first character is dropped and the rest parsed
            # as an integer; minus one gives the index into the user turns.
            pos = int(ann["turn"][1:]) - 1
            if 0 <= pos < len(user_turns):
                # NOTE(review): presumably the assistant message preceding user
                # turn `pos` -- confirm against OA._prev_assistant. A falsy
                # result is stored as the empty string.
                previous_reply = OA._prev_assistant(conversation, pos) or ""
                collected.append(
                    {
                        "cid": record["id"],
                        "turn": ann["turn"],
                        "prev_assistant": previous_reply,
                        "this_user": user_turns[pos],
                        "kim": sorted(ann["types"]),
                    }
                )
    return collected
def _stratified(units, is_pos, n, rng, pos_frac):
    """Draw a shuffled sample of at most ``n`` units with a target positive share.

    Args:
        units: iterable of unit dicts to sample from.
        is_pos: predicate returning a truthy value for positive units.
        n: requested sample size; values below zero are treated as zero.
        rng: object providing ``shuffle`` (e.g. ``random.Random``).
        pos_frac: desired fraction of positives; the resulting positive count
            is clamped to the range ``[0, n]``.

    Returns:
        A list of the selected unit dicts in shuffled order. Each selected
        dict is mutated in place: its ``"idx"`` key is set to its position in
        the returned list. The sample is shorter than ``n`` when there are not
        enough positives and negatives to fill the requested counts.
    """
    pos, neg = [], []
    for u in units:
        (pos if is_pos(u) else neg).append(u)
    rng.shuffle(pos)
    rng.shuffle(neg)

    # Clamp the counts: a negative slice bound would silently mean "all but
    # the last k" and produce a sample of the wrong size/composition.
    n = max(0, n)
    wanted_pos = max(0, min(n, int(n * pos_frac)))
    npos = min(len(pos), wanted_pos)
    nneg = n - npos  # non-negative because npos <= n

    sample = pos[:npos] + neg[:nneg]
    rng.shuffle(sample)
    for k, u in enumerate(sample):
        u["idx"] = k
    return sample
def _write(name, sample, blind_fields):
    """Write the full sample and its blind counterpart as JSON next to this module.

    Two files are created in ``HERE``:
    ``{name}_samples_p7.json`` holds every unit as-is, and
    ``{name}_blind_p7.json`` holds only ``idx`` plus the ``blind_fields`` of
    each unit. A one-line summary is printed afterwards.

    Args:
        name: prefix used for both output file names and the summary line.
        sample: list of unit dicts; each must carry ``"idx"``, ``"kim"`` and
            every key listed in ``blind_fields``.
        blind_fields: keys copied into the blind version of each unit.
    """
    samples_path = os.path.join(HERE, f"{name}_samples_p7.json")
    blind_path = os.path.join(HERE, f"{name}_blind_p7.json")

    # ensure_ascii=False emits raw non-ASCII text, so the encoding is pinned to
    # UTF-8 instead of depending on the platform default; the context manager
    # guarantees the file is flushed and closed.
    with open(samples_path, "w", encoding="utf-8") as fh:
        json.dump(sample, fh, ensure_ascii=False, indent=1)

    blind = [{"idx": u["idx"], **{f: u[f] for f in blind_fields}} for u in sample]
    with open(blind_path, "w", encoding="utf-8") as fh:
        json.dump(blind, fh, ensure_ascii=False, indent=1)

    # "pos-ish": units whose label is truthy (True, or a non-empty collection).
    n_pos = sum(1 for u in sample if u["kim"])
    print(f"[prep_p7] {name}: {len(sample)} units ({n_pos} pos-ish)")
def main():
    """Build and write the Interaction and Critical samples.

    A single ``random.Random(SEED)`` drives both draws, so the Critical sample
    depends on the Interaction draw having been made first.
    """
    rng = random.Random(SEED)
    gt = K.load_gt()
    convs = K.load_convs()

    def is_refinement(unit):
        # Interaction positives: the ground-truth flag itself.
        return unit["kim"]

    def has_types(unit):
        # Critical positives: at least one ground-truth type.
        return len(unit["kim"]) > 0

    interaction_pool = _interaction_units(gt, convs)
    inter = _stratified(interaction_pool, is_refinement, N, rng, 0.4)
    _write("interaction", inter, ["prev_user", "this_user"])

    critical_pool = _critical_units(gt, convs)
    crit = _stratified(critical_pool, has_types, N, rng, 0.5)
    _write("critical", crit, ["prev_assistant", "this_user"])
# Run the prep only when executed as a script or via ``python -m``.
if __name__ == "__main__":
    main()