# TurboSkillSlug / distractor_cases.py
# legendarydragontamer's picture
# deploy
# 51a9974
# Raw
# History Blame Contribute Delete
# 6.73 kB
"""
Distractor-trap eval cases: tasks where the OBVIOUS approach is WRONG in a
non-obvious way, and the skill's gotcha warns about exactly that trap.
Design contract for each case (to keep the eval honest):
- task_prompt: a task whose naive solution is tempting but subtly wrong.
- the SESSION the skill is built from contains the TRAP as a gotcha (negative
knowledge), phrased so it warns WITHOUT just stating this task's final answer.
- trap_signature: strings that appear if the model FELL for the distractor.
- correct_signature: strings that appear if the model AVOIDED the trap.
- leak_terms: if the skill literally contains the task's answer, it's excluded.
These are deliberately at the EDGE of frontier ability: each is a known footgun
where even strong models often produce confident wrong answers on the first pass.
"""
from distractor_eval import DistractorCase
# Registry of distractor-trap eval cases.
#
# Every entry pairs a simulated session (whose "gotchas" carry the warning)
# with a task prompt where the tempting solution is the wrong one, plus the
# substrings used to tell a trapped answer from a correct one.
CASES = [
    # ------------------------------------------------------------------
    # Case 1 - floating point: a naive running total over many floats
    # of mixed magnitude loses precision.
    # ------------------------------------------------------------------
    DistractorCase(
        name="float_sum_kahan_trap",
        session_extraction={
            "themes": ["numerical precision", "floating point"],
            "approaches_tried": [
                {
                    "approach": "sum the values in a simple loop",
                    "why_it_failed": (
                        "naive accumulation lost low-order bits; the running "
                        "total swamped small addends and the result drifted"
                    ),
                },
            ],
            "dead_ends": [
                {
                    "position": 0.4,
                    "what_happened": (
                        "a plain left-to-right sum gave a visibly wrong total "
                        "on 1e7 values near machine epsilon"
                    ),
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": (
                        "used compensated (Kahan) summation to carry the lost "
                        "low-order error term"
                    ),
                },
            ],
            "gotchas": [
                (
                    "Summing many floats with a naive running total silently "
                    "loses precision because each addition rounds off the "
                    "small addend against a large accumulator; use "
                    "compensated summation or sort-by-magnitude."
                ),
            ],
            "sentiment_arc": {"start": "confident", "end": "humbled"},
        },
        task_prompt=(
            "Write a function that returns the sum of a list of 10 million "
            "float64 values that range over many orders of magnitude "
            "(some ~1e8, many ~1e-8). "
            "Accuracy to the last few bits matters. "
            "Give the approach and why."
        ),
        # NOTE(review): "sum(" is a substring of "fsum(", so an answer that
        # uses math.fsum also contains a trap string - confirm how the
        # evaluator resolves an answer that hits both lists.
        trap_signature=[
            "simple loop",
            "just sum",
            "sum(",
            "total += ",
            "naive",
        ],
        correct_signature=[
            "kahan",
            "compensated",
            "neumaier",
            "sort",
            "pairwise",
            "math.fsum",
            "fsum",
        ],
        leak_terms=["10 million float64 values that range over many orders"],
        trap_desc=(
            "Summing in a plain left-to-right loop (total += x or sum()), "
            "which loses precision."
        ),
        correct_desc=(
            "Compensated summation (Kahan/Neumaier), pairwise summation, "
            "sorting by magnitude, or math.fsum."
        ),
    ),
    # ------------------------------------------------------------------
    # Case 2 - concurrency: an unsynchronized check-then-act on a shared
    # cache lets two threads both compute the same key.
    # ------------------------------------------------------------------
    DistractorCase(
        name="check_then_act_race",
        session_extraction={
            "themes": ["concurrency", "race conditions"],
            "approaches_tried": [
                {
                    "approach": "if key not in cache: cache[key]=compute()",
                    "why_it_failed": (
                        "two threads passed the 'not in' check simultaneously "
                        "and both computed, double-work and an inconsistent "
                        "cache"
                    ),
                },
            ],
            "dead_ends": [
                {
                    "position": 0.5,
                    "what_happened": (
                        "the check-then-act pattern looked atomic but wasn't; "
                        "under load the value was computed twice"
                    ),
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": (
                        "guarded the check-and-set with a lock, or used an "
                        "atomic get-or-compute primitive"
                    ),
                },
            ],
            "gotchas": [
                (
                    "A 'if key not in cache: cache[key] = compute()' is a "
                    "check-then-act race: two threads can both pass the check "
                    "before either writes; make the check and write atomic "
                    "with a lock or a get-or-compute primitive."
                ),
            ],
            "sentiment_arc": {"start": "confident", "end": "wary"},
        },
        task_prompt=(
            "In a multithreaded service, implement a memoizing cache so an "
            "expensive compute(key) runs at most once per key even under "
            "concurrent access. "
            "Show the implementation and explain why it is correct."
        ),
        trap_signature=[
            "if key not in",
            "if key in cache",
            "not in self",
            "not in cache",
        ],
        correct_signature=[
            "lock",
            "with self._lock",
            "threading.lock",
            "atomic",
            "setdefault",
            "double-checked",
            "futures",
        ],
        leak_terms=["memoizing cache so an expensive compute"],
        trap_desc=(
            "A check-then-act 'if key not in cache: cache[key]=compute()' "
            "with no synchronization."
        ),
        correct_desc=(
            "Synchronize the check-and-set with a lock (double-checked), "
            "or use an atomic get-or-compute / futures."
        ),
    ),
    # ------------------------------------------------------------------
    # Case 3 - SQL / N+1: the per-parent ORM loop returns correct rows
    # but issues one query per parent and collapses at scale.
    # ------------------------------------------------------------------
    DistractorCase(
        name="n_plus_one_query",
        session_extraction={
            "themes": ["database performance", "ORM"],
            "approaches_tried": [
                {
                    "approach": "loop over parents, query children per parent",
                    "why_it_failed": (
                        "issued one query per parent (N+1 queries); correct "
                        "results but it fell over at scale"
                    ),
                },
            ],
            "dead_ends": [
                {
                    "position": 0.45,
                    "what_happened": (
                        "the per-row query loop worked in tests with 5 rows "
                        "and timed out in prod with 50k"
                    ),
                },
            ],
            "breakthroughs": [
                {
                    "position": 0.85,
                    "what_worked": (
                        "eager-loaded children in one query (join / IN clause "
                        "/ select_related) instead of per-parent queries"
                    ),
                },
            ],
            "gotchas": [
                (
                    "Looping over parent rows and querying each parent's "
                    "children separately is the N+1 query trap: it is correct "
                    "but issues one query per row and dies at scale; fetch "
                    "related rows in a single query (join or IN) instead."
                ),
            ],
            "sentiment_arc": {"start": "confident", "end": "chastened"},
        },
        task_prompt=(
            "Given 50,000 authors, return each author with their list of "
            "books from a relational DB via an ORM. "
            "Write the query/code and explain the performance "
            "characteristics."
        ),
        trap_signature=[
            "for author in",
            "author.books",
            "query per",
            "loop",
            ".filter(author",
        ],
        # NOTE(review): "IN (" is the only signature in this file that
        # contains uppercase letters - confirm the evaluator matches
        # case-insensitively, otherwise it may never fire.
        correct_signature=[
            "join",
            "select_related",
            "prefetch",
            "eager",
            "IN (",
            "single query",
            "one query",
            "joinedload",
        ],
        leak_terms=[
            "50,000 authors, return each author with their list of books",
        ],
        trap_desc=(
            "Looping over parents and issuing one child query per parent "
            "(N+1 queries)."
        ),
        correct_desc=(
            "Fetch related rows in a single query: join, IN clause, "
            "select_related/prefetch/joinedload."
        ),
    ),
]