Spaces:

build-small-hackathon
/

TurboSkillSlug

Sleeping

App Files Files Community

TurboSkillSlug / distractor_cases.py

legendarydragontamer

deploy

51a9974 20 days ago

Raw

History Blame Contribute Delete

6.73 kB

	"""
	Distractor-trap eval cases: tasks where the OBVIOUS approach is WRONG in a
	non-obvious way, and the skill's gotcha warns about exactly that trap.

	Design contract for each case (to keep the eval honest):
	- task_prompt: a task whose naive solution is tempting but subtly wrong.
	- the SESSION the skill is built from contains the TRAP as a gotcha (negative
	knowledge), phrased so it warns WITHOUT just stating this task's final answer.
	- trap_signature: strings that appear if the model FELL for the distractor.
	- correct_signature: strings that appear if the model AVOIDED the trap.
	- leak_terms: if the skill literally contains the task's answer, it's excluded.

	These are deliberately at the EDGE of frontier ability: each is a known footgun
	where even strong models often produce confident wrong answers on the first pass.
	"""
	from distractor_eval import DistractorCase

	# Registry of distractor-trap eval cases. Each entry is a DistractorCase built
	# from keyword arguments only: the synthetic session the skill is derived from
	# (session_extraction), the task shown to the model (task_prompt), the
	# substring signatures for the trap / correct outcome, the leak terms, and
	# human-readable descriptions of both outcomes.
	CASES = [
	    # 1. Floating-point: summing many small floats naively loses precision.
	    DistractorCase(
	        name="float_sum_kahan_trap",
	        session_extraction={
	            "themes": ["numerical precision", "floating point"],
	            "approaches_tried": [
	                {
	                    "approach": "sum the values in a simple loop",
	                    "why_it_failed": "naive accumulation lost low-order bits; the running total swamped small addends and the result drifted",
	                },
	            ],
	            "dead_ends": [
	                {
	                    "position": 0.4,
	                    "what_happened": "a plain left-to-right sum gave a visibly wrong total on 1e7 values near machine epsilon",
	                },
	            ],
	            "breakthroughs": [
	                {
	                    "position": 0.85,
	                    "what_worked": "used compensated (Kahan) summation to carry the lost low-order error term",
	                },
	            ],
	            "gotchas": [
	                "Summing many floats with a naive running total silently loses precision because each addition rounds off the small addend against a large accumulator; use compensated summation or sort-by-magnitude.",
	            ],
	            "sentiment_arc": {"start": "confident", "end": "humbled"},
	        },
	        task_prompt=(
	            "Write a function that returns the sum of a list of 10 million float64 "
	            "values that range over many orders of magnitude (some ~1e8, many ~1e-8). "
	            "Accuracy to the last few bits matters. Give the approach and why."
	        ),
	        # The builtin-sum signatures are anchored on a preceding delimiter
	        # (" sum(", "=sum(", "(sum(") instead of a bare "sum(": the bare form
	        # is a substring of "math.fsum(" / "fsum(", which are listed in
	        # correct_signature below, so a correct fsum answer would also have
	        # carried the trap signature.
	        trap_signature=["simple loop", "just sum", " sum(", "=sum(", "(sum(", "total += ", "naive"],
	        correct_signature=["kahan", "compensated", "neumaier", "sort", "pairwise", "math.fsum", "fsum"],
	        leak_terms=["10 million float64 values that range over many orders"],
	        trap_desc="Summing in a plain left-to-right loop (total += x or sum()), which loses precision.",
	        correct_desc="Compensated summation (Kahan/Neumaier), pairwise summation, sorting by magnitude, or math.fsum.",
	    ),

	    # 2. Concurrency: check-then-act race on a shared dict / cache.
	    DistractorCase(
	        name="check_then_act_race",
	        session_extraction={
	            "themes": ["concurrency", "race conditions"],
	            "approaches_tried": [
	                {
	                    "approach": "if key not in cache: cache[key]=compute()",
	                    "why_it_failed": "two threads passed the 'not in' check simultaneously and both computed, double-work and an inconsistent cache",
	                },
	            ],
	            "dead_ends": [
	                {
	                    "position": 0.5,
	                    "what_happened": "the check-then-act pattern looked atomic but wasn't; under load the value was computed twice",
	                },
	            ],
	            "breakthroughs": [
	                {
	                    "position": 0.85,
	                    "what_worked": "guarded the check-and-set with a lock, or used an atomic get-or-compute primitive",
	                },
	            ],
	            "gotchas": [
	                "A 'if key not in cache: cache[key] = compute()' is a check-then-act race: two threads can both pass the check before either writes; make the check and write atomic with a lock or a get-or-compute primitive.",
	            ],
	            "sentiment_arc": {"start": "confident", "end": "wary"},
	        },
	        task_prompt=(
	            "In a multithreaded service, implement a memoizing cache so an expensive "
	            "compute(key) runs at most once per key even under concurrent access. "
	            "Show the implementation and explain why it is correct."
	        ),
	        trap_signature=["if key not in", "if key in cache", "not in self", "not in cache"],
	        correct_signature=["lock", "with self._lock", "threading.lock", "atomic", "setdefault", "double-checked", "futures"],
	        leak_terms=["memoizing cache so an expensive compute"],
	        trap_desc="A check-then-act 'if key not in cache: cache[key]=compute()' with no synchronization.",
	        correct_desc="Synchronize the check-and-set with a lock (double-checked), or use an atomic get-or-compute / futures.",
	    ),

	    # 3. SQL / N+1: the obvious ORM loop is correct but pathologically slow.
	    DistractorCase(
	        name="n_plus_one_query",
	        session_extraction={
	            "themes": ["database performance", "ORM"],
	            "approaches_tried": [
	                {
	                    "approach": "loop over parents, query children per parent",
	                    "why_it_failed": "issued one query per parent (N+1 queries); correct results but it fell over at scale",
	                },
	            ],
	            "dead_ends": [
	                {
	                    "position": 0.45,
	                    "what_happened": "the per-row query loop worked in tests with 5 rows and timed out in prod with 50k",
	                },
	            ],
	            "breakthroughs": [
	                {
	                    "position": 0.85,
	                    "what_worked": "eager-loaded children in one query (join / IN clause / select_related) instead of per-parent queries",
	                },
	            ],
	            "gotchas": [
	                "Looping over parent rows and querying each parent's children separately is the N+1 query trap: it is correct but issues one query per row and dies at scale; fetch related rows in a single query (join or IN) instead.",
	            ],
	            "sentiment_arc": {"start": "confident", "end": "chastened"},
	        },
	        task_prompt=(
	            "Given 50,000 authors, return each author with their list of books from a "
	            "relational DB via an ORM. Write the query/code and explain the performance "
	            "characteristics."
	        ),
	        trap_signature=["for author in", "author.books", "query per", "loop", ".filter(author"],
	        # NOTE(review): "IN (" is the only signature in this file written with
	        # upper-case letters (compare "kahan", "threading.lock"). If the
	        # matcher lower-cases the response before the substring check, this
	        # entry can never match -- confirm against distractor_eval.
	        correct_signature=["join", "select_related", "prefetch", "eager", "IN (", "single query", "one query", "joinedload"],
	        leak_terms=["50,000 authors, return each author with their list of books"],
	        trap_desc="Looping over parents and issuing one child query per parent (N+1 queries).",
	        correct_desc="Fetch related rows in a single query: join, IN clause, select_related/prefetch/joinedload.",
	    ),
	]