"""
Distractor-trap eval cases: tasks where the OBVIOUS approach is WRONG in a
non-obvious way, and the skill's gotcha warns about exactly that trap.

Design contract for each case (to keep the eval honest):
- task_prompt: a task whose naive solution is tempting but subtly wrong.
- the SESSION the skill is built from contains the TRAP as a gotcha (negative
  knowledge), phrased so it warns WITHOUT just stating this task's final answer.
- trap_signature: strings that appear if the model FELL for the distractor.
- correct_signature: strings that appear if the model AVOIDED the trap.
- leak_terms: if the skill literally contains the task's answer, it's excluded.

These are deliberately at the EDGE of frontier ability: each is a known footgun
where even strong models often produce confident wrong answers on the first pass.
"""
from distractor_eval import DistractorCase


# All distractor-trap cases defined by this module. Each entry pairs a
# synthetic session (whose "gotchas" carry the warning) with a task prompt and
# the signature strings used to classify a model's answer.
CASES = [

    # ------------------------------------------------------------------
    # Case 1: naive floating-point accumulation vs. compensated summation.
    # ------------------------------------------------------------------
    DistractorCase(
        name="float_sum_kahan_trap",
        session_extraction={
            "themes": ["numerical precision", "floating point"],
            "approaches_tried": [
                {
                    "approach": "sum the values in a simple loop",
                    "why_it_failed": "naive accumulation lost low-order bits; the running total swamped small addends and the result drifted",
                },
            ],
            "dead_ends": [
                {
                    "position": 0.4,
                    "what_happened": "a plain left-to-right sum gave a visibly wrong total on 1e7 values near machine epsilon",
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": "used compensated (Kahan) summation to carry the lost low-order error term",
                },
            ],
            "gotchas": [
                "Summing many floats with a naive running total silently loses precision because each addition rounds off the small addend against a large accumulator; use compensated summation or sort-by-magnitude.",
            ],
            "sentiment_arc": {"start": "confident", "end": "humbled"},
        },
        task_prompt=("Write a function that returns the sum of a list of 10 million float64 "
                     "values that range over many orders of magnitude (some ~1e8, many ~1e-8). "
                     "Accuracy to the last few bits matters. Give the approach and why."),
        # NOTE(review): "sum(" is a substring of "fsum(", so an answer calling
        # math.fsum(...) would also contain this trap string, and "naive" can
        # appear in a correct answer that explains why the naive sum fails.
        # Presumably the scorer resolves answers matching both lists -- confirm
        # against distractor_eval.
        trap_signature=["simple loop", "just sum", "sum(", "total += ", "naive"],
        correct_signature=["kahan", "compensated", "neumaier", "sort", "pairwise", "math.fsum", "fsum"],
        leak_terms=["10 million float64 values that range over many orders"],
        trap_desc="Summing in a plain left-to-right loop (total += x or sum()), which loses precision.",
        correct_desc="Compensated summation (Kahan/Neumaier), pairwise summation, sorting by magnitude, or math.fsum.",
    ),

    # ------------------------------------------------------------------
    # Case 2: unsynchronized check-then-act on a shared cache.
    # ------------------------------------------------------------------
    DistractorCase(
        name="check_then_act_race",
        session_extraction={
            "themes": ["concurrency", "race conditions"],
            "approaches_tried": [
                {
                    "approach": "if key not in cache: cache[key]=compute()",
                    "why_it_failed": "two threads passed the 'not in' check simultaneously and both computed, double-work and an inconsistent cache",
                },
            ],
            "dead_ends": [
                {
                    "position": 0.5,
                    "what_happened": "the check-then-act pattern looked atomic but wasn't; under load the value was computed twice",
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": "guarded the check-and-set with a lock, or used an atomic get-or-compute primitive",
                },
            ],
            "gotchas": [
                "A 'if key not in cache: cache[key] = compute()' is a check-then-act race: two threads can both pass the check before either writes; make the check and write atomic with a lock or a get-or-compute primitive.",
            ],
            "sentiment_arc": {"start": "confident", "end": "wary"},
        },
        task_prompt=("In a multithreaded service, implement a memoizing cache so an expensive "
                     "compute(key) runs at most once per key even under concurrent access. "
                     "Show the implementation and explain why it is correct."),
        # NOTE(review): a lock-guarded (double-checked) implementation, which
        # correct_desc accepts, still contains "if key not in cache" inside the
        # critical section, so these trap strings can fire on a correct answer.
        # Confirm how distractor_eval treats answers that match both lists.
        trap_signature=["if key not in", "if key in cache", "not in self", "not in cache"],
        correct_signature=["lock", "with self._lock", "threading.lock", "atomic", "setdefault", "double-checked", "futures"],
        leak_terms=["memoizing cache so an expensive compute"],
        trap_desc="A check-then-act 'if key not in cache: cache[key]=compute()' with no synchronization.",
        correct_desc="Synchronize the check-and-set with a lock (double-checked), or use an atomic get-or-compute / futures.",
    ),

    # ------------------------------------------------------------------
    # Case 3: N+1 queries through an ORM.
    # ------------------------------------------------------------------
    DistractorCase(
        name="n_plus_one_query",
        session_extraction={
            "themes": ["database performance", "ORM"],
            "approaches_tried": [
                {
                    "approach": "loop over parents, query children per parent",
                    "why_it_failed": "issued one query per parent (N+1 queries); correct results but it fell over at scale",
                },
            ],
            "dead_ends": [
                {
                    "position": 0.45,
                    "what_happened": "the per-row query loop worked in tests with 5 rows and timed out in prod with 50k",
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": "eager-loaded children in one query (join / IN clause / select_related) instead of per-parent queries",
                },
            ],
            "gotchas": [
                "Looping over parent rows and querying each parent's children separately is the N+1 query trap: it is correct but issues one query per row and dies at scale; fetch related rows in a single query (join or IN) instead.",
            ],
            "sentiment_arc": {"start": "confident", "end": "chastened"},
        },
        task_prompt=("Given 50,000 authors, return each author with their list of books from a "
                     "relational DB via an ORM. Write the query/code and explain the performance "
                     "characteristics."),
        # NOTE(review): an eager-loading answer usually still iterates the
        # result ("for author in ...", "author.books", "loop"), so these trap
        # strings can fire on a correct answer -- confirm scorer precedence.
        trap_signature=["for author in", "author.books", "query per", "loop", ".filter(author"],
        # NOTE(review): "IN (" is the only signature in this file containing
        # uppercase letters; the rest (e.g. "threading.lock", "kahan") look
        # written for case-folded text. Verify the matcher is case-insensitive
        # on both sides, otherwise this entry may never match.
        correct_signature=["join", "select_related", "prefetch", "eager", "IN (", "single query", "one query", "joinedload"],
        leak_terms=["50,000 authors, return each author with their list of books"],
        trap_desc="Looping over parents and issuing one child query per parent (N+1 queries).",
        correct_desc="Fetch related rows in a single query: join, IN clause, select_related/prefetch/joinedload.",
    ),
]