""" Distractor-trap eval cases: tasks where the OBVIOUS approach is WRONG in a non-obvious way, and the skill's gotcha warns about exactly that trap. Design contract for each case (to keep the eval honest): - task_prompt: a task whose naive solution is tempting but subtly wrong. - the SESSION the skill is built from contains the TRAP as a gotcha (negative knowledge), phrased so it warns WITHOUT just stating this task's final answer. - trap_signature: strings that appear if the model FELL for the distractor. - correct_signature: strings that appear if the model AVOIDED the trap. - leak_terms: if the skill literally contains the task's answer, it's excluded. These are deliberately at the EDGE of frontier ability: each is a known footgun where even strong models often produce confident wrong answers on the first pass. """ from distractor_eval import DistractorCase CASES = [ # 1. Floating-point: summing many small floats naively loses precision. DistractorCase( name="float_sum_kahan_trap", session_extraction={ "themes": ["numerical precision", "floating point"], "approaches_tried": [ {"approach": "sum the values in a simple loop", "why_it_failed": "naive accumulation lost low-order bits; the running total swamped small addends and the result drifted"}, ], "dead_ends": [{"position": 0.4, "what_happened": "a plain left-to-right sum gave a visibly wrong total on 1e7 values near machine epsilon"}], "breakthroughs": [{"position": 0.85, "what_worked": "used compensated (Kahan) summation to carry the lost low-order error term"}], "gotchas": [ "Summing many floats with a naive running total silently loses precision because each addition rounds off the small addend against a large accumulator; use compensated summation or sort-by-magnitude.", ], "sentiment_arc": {"start": "confident", "end": "humbled"}, }, task_prompt=("Write a function that returns the sum of a list of 10 million float64 " "values that range over many orders of magnitude (some ~1e8, many ~1e-8). " "Accuracy to the last few bits matters. Give the approach and why."), trap_signature=["simple loop", "just sum", "sum(", "total += ", "naive"], correct_signature=["kahan", "compensated", "neumaier", "sort", "pairwise", "math.fsum", "fsum"], leak_terms=["10 million float64 values that range over many orders"], trap_desc="Summing in a plain left-to-right loop (total += x or sum()), which loses precision.", correct_desc="Compensated summation (Kahan/Neumaier), pairwise summation, sorting by magnitude, or math.fsum.", ), # 2. Concurrency: check-then-act race on a shared dict / cache. DistractorCase( name="check_then_act_race", session_extraction={ "themes": ["concurrency", "race conditions"], "approaches_tried": [ {"approach": "if key not in cache: cache[key]=compute()", "why_it_failed": "two threads passed the 'not in' check simultaneously and both computed, double-work and an inconsistent cache"}, ], "dead_ends": [{"position": 0.5, "what_happened": "the check-then-act pattern looked atomic but wasn't; under load the value was computed twice"}], "breakthroughs": [{"position": 0.85, "what_worked": "guarded the check-and-set with a lock, or used an atomic get-or-compute primitive"}], "gotchas": [ "A 'if key not in cache: cache[key] = compute()' is a check-then-act race: two threads can both pass the check before either writes; make the check and write atomic with a lock or a get-or-compute primitive.", ], "sentiment_arc": {"start": "confident", "end": "wary"}, }, task_prompt=("In a multithreaded service, implement a memoizing cache so an expensive " "compute(key) runs at most once per key even under concurrent access. " "Show the implementation and explain why it is correct."), trap_signature=["if key not in", "if key in cache", "not in self", "not in cache"], correct_signature=["lock", "with self._lock", "threading.lock", "atomic", "setdefault", "double-checked", "futures"], leak_terms=["memoizing cache so an expensive compute"], trap_desc="A check-then-act 'if key not in cache: cache[key]=compute()' with no synchronization.", correct_desc="Synchronize the check-and-set with a lock (double-checked), or use an atomic get-or-compute / futures.", ), # 3. SQL / N+1: the obvious ORM loop is correct but pathologically slow. DistractorCase( name="n_plus_one_query", session_extraction={ "themes": ["database performance", "ORM"], "approaches_tried": [ {"approach": "loop over parents, query children per parent", "why_it_failed": "issued one query per parent (N+1 queries); correct results but it fell over at scale"}, ], "dead_ends": [{"position": 0.45, "what_happened": "the per-row query loop worked in tests with 5 rows and timed out in prod with 50k"}], "breakthroughs": [{"position": 0.85, "what_worked": "eager-loaded children in one query (join / IN clause / select_related) instead of per-parent queries"}], "gotchas": [ "Looping over parent rows and querying each parent's children separately is the N+1 query trap: it is correct but issues one query per row and dies at scale; fetch related rows in a single query (join or IN) instead.", ], "sentiment_arc": {"start": "confident", "end": "chastened"}, }, task_prompt=("Given 50,000 authors, return each author with their list of books from a " "relational DB via an ORM. Write the query/code and explain the performance " "characteristics."), trap_signature=["for author in", "author.books", "query per", "loop", ".filter(author"], correct_signature=["join", "select_related", "prefetch", "eager", "IN (", "single query", "one query", "joinedload"], leak_terms=["50,000 authors, return each author with their list of books"], trap_desc="Looping over parents and issuing one child query per parent (N+1 queries).", correct_desc="Fetch related rows in a single query: join, IN clause, select_related/prefetch/joinedload.", ), ]