lean-laguna / scripts /build_code_dataset.py

Upload folder using huggingface_hub

8cc969e verified about 1 hour ago

9.64 kB

	#!/usr/bin/env python3
	"""build_code_dataset.py — author + validate a 12-problem NON-HumanEval code
	dataset in the exact spec_rl schema {prompt, test, entry_point}, then write it
	to data/adaption_code.jsonl.

	Validation is done with spec_rl's OWN reward core (fraction_passing): for each
	problem we (a) confirm a known-correct reference solution scores 1.0, and (b)
	confirm a deliberately-wrong solution scores < 1.0. This guarantees the eval
	cannot silently run against an all-broken or trivially-passing dataset.
	"""
	from __future__ import annotations
	import json, sys
	from pathlib import Path

	# Grandparent directory of this file, i.e. the directory that contains the
	# folder this script lives in.
	ROOT = Path(__file__).resolve().parents[1]
	# Directory one level above ROOT; the spec_rl environment is looked up under it.
	REPO = ROOT.parent
	# Put <REPO>/environments/spec_rl at the front of sys.path so the import on the
	# next line resolves there. The insert must run before the import.
	sys.path.insert(0, str(REPO / "environments" / "spec_rl"))
	import spec_rl

	# Destination file for the validated rows written by main().
	OUT = ROOT / "data" / "adaption_code.jsonl"

	# Each entry: (prompt, test, entry_point, good_body, bad_body)
	#   prompt      = signature + docstring (no body)
	#   test        = a check(candidate) function with >= 3 asserts
	#   entry_point = name of the function defined by prompt
	#   good_body   = reference completion expected to pass every assert
	#   bad_body    = deliberately wrong completion expected to fail at least one
	#
	# A body is appended directly to its prompt, so prompt docstrings, test asserts
	# and body statements all use the same 4-space indentation step. Bodies that
	# contain nested blocks (for / while / if) only parse when that nesting is
	# indented one step deeper than the enclosing statement.
	PROBLEMS = [
	    (
	        'def running_total(nums):\n'
	        '    """Return a list where element i is the sum of nums[0..i] inclusive.\n'
	        '    running_total([1, 2, 3]) -> [1, 3, 6]; running_total([]) -> [].\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate([1, 2, 3]) == [1, 3, 6]\n"
	        "    assert candidate([]) == []\n"
	        "    assert candidate([5]) == [5]\n"
	        "    assert candidate([-1, 1, -1]) == [-1, 0, -1]\n",
	        "running_total",
	        "    out = []\n"
	        "    s = 0\n"
	        "    for n in nums:\n"
	        "        s += n\n"
	        "        out.append(s)\n"
	        "    return out\n",
	        "    return nums\n",
	    ),
	    (
	        'def count_vowels(s):\n'
	        '    """Return the number of vowels (a, e, i, o, u; case-insensitive) in s.\n'
	        '    count_vowels(\'Hello\') -> 2; count_vowels(\'\') -> 0.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate('Hello') == 2\n"
	        "    assert candidate('') == 0\n"
	        "    assert candidate('AEIOU') == 5\n"
	        "    assert candidate('xyz') == 0\n",
	        "count_vowels",
	        "    return sum(1 for c in s.lower() if c in 'aeiou')\n",
	        "    return len(s)\n",
	    ),
	    (
	        'def merge_counts(a, b):\n'
	        '    """Given two dicts of key->int counts, return a new dict whose value per key\n'
	        '    is the sum of counts from a and b. Keys may appear in either dict.\n'
	        '    merge_counts({\'x\': 1}, {\'x\': 2, \'y\': 3}) -> {\'x\': 3, \'y\': 3}.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate({'x': 1}, {'x': 2, 'y': 3}) == {'x': 3, 'y': 3}\n"
	        "    assert candidate({}, {}) == {}\n"
	        "    assert candidate({'a': 5}, {}) == {'a': 5}\n"
	        "    assert candidate({}, {'b': 7}) == {'b': 7}\n",
	        "merge_counts",
	        "    out = dict(a)\n"
	        "    for k, v in b.items():\n"
	        "        out[k] = out.get(k, 0) + v\n"
	        "    return out\n",
	        "    return dict(a)\n",
	    ),
	    (
	        'def second_largest(nums):\n'
	        '    """Return the second largest DISTINCT value in nums, or None if there are\n'
	        '    fewer than two distinct values.\n'
	        '    second_largest([4, 1, 4, 3]) -> 3; second_largest([7]) -> None.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate([4, 1, 4, 3]) == 3\n"
	        "    assert candidate([7]) is None\n"
	        "    assert candidate([5, 5, 5]) is None\n"
	        "    assert candidate([1, 2, 3, 4]) == 3\n"
	        "    assert candidate([-1, -2]) == -2\n",
	        "second_largest",
	        "    u = sorted(set(nums), reverse=True)\n"
	        "    return u[1] if len(u) >= 2 else None\n",
	        "    return max(nums)\n",
	    ),
	    (
	        'def is_palindrome(s):\n'
	        '    """Return True if s is a palindrome ignoring case and non-alphanumeric chars.\n'
	        '    is_palindrome(\'A man, a plan, a canal: Panama\') -> True; is_palindrome(\'ab\') -> False.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate('A man, a plan, a canal: Panama') is True\n"
	        "    assert candidate('ab') is False\n"
	        "    assert candidate('') is True\n"
	        "    assert candidate('Racecar') is True\n"
	        "    assert candidate('No lemon, no melon') is True\n",
	        "is_palindrome",
	        "    t = [c.lower() for c in s if c.isalnum()]\n"
	        "    return t == t[::-1]\n",
	        "    return s == s[::-1]\n",
	    ),
	    (
	        'def flatten(nested):\n'
	        '    """Flatten a list that may contain nested lists (one level or deep) into a\n'
	        '    single flat list, preserving order.\n'
	        '    flatten([1, [2, [3, 4]], 5]) -> [1, 2, 3, 4, 5].\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate([1, [2, [3, 4]], 5]) == [1, 2, 3, 4, 5]\n"
	        "    assert candidate([]) == []\n"
	        "    assert candidate([[1], [2], [3]]) == [1, 2, 3]\n"
	        "    assert candidate([1, 2, 3]) == [1, 2, 3]\n",
	        "flatten",
	        # Recurses on the entry point itself, so the completion stays a single
	        # function body instead of also defining a second top-level helper.
	        "    out = []\n"
	        "    for x in nested:\n"
	        "        if isinstance(x, list):\n"
	        "            out.extend(flatten(x))\n"
	        "        else:\n"
	        "            out.append(x)\n"
	        "    return out\n",
	        "    return nested\n",
	    ),
	    (
	        'def word_frequencies(text):\n'
	        '    """Return a dict mapping each lowercased word to its count. Words are split on\n'
	        '    whitespace; punctuation is NOT stripped beyond lowercasing.\n'
	        '    word_frequencies(\'a a b\') -> {\'a\': 2, \'b\': 1}.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate('a a b') == {'a': 2, 'b': 1}\n"
	        "    assert candidate('') == {}\n"
	        "    assert candidate('Hi hi HI') == {'hi': 3}\n"
	        "    assert candidate('one') == {'one': 1}\n",
	        "word_frequencies",
	        "    d = {}\n"
	        "    for w in text.lower().split():\n"
	        "        d[w] = d.get(w, 0) + 1\n"
	        "    return d\n",
	        "    return {}\n",
	    ),
	    (
	        'def chunk(seq, size):\n'
	        '    """Split list seq into consecutive chunks of length size (the last chunk may be\n'
	        '    shorter). size is a positive integer.\n'
	        '    chunk([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]].\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]\n"
	        "    assert candidate([], 3) == []\n"
	        "    assert candidate([1, 2, 3], 1) == [[1], [2], [3]]\n"
	        "    assert candidate([1, 2], 5) == [[1, 2]]\n",
	        "chunk",
	        "    return [seq[i:i+size] for i in range(0, len(seq), size)]\n",
	        "    return [seq]\n",
	    ),
	    (
	        'def gcd(a, b):\n'
	        '    """Return the greatest common divisor of two non-negative integers a and b.\n'
	        '    gcd(12, 18) -> 6; gcd(7, 0) -> 7.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate(12, 18) == 6\n"
	        "    assert candidate(7, 0) == 7\n"
	        "    assert candidate(0, 5) == 5\n"
	        "    assert candidate(17, 13) == 1\n"
	        "    assert candidate(100, 80) == 20\n",
	        "gcd",
	        "    while b:\n"
	        "        a, b = b, a % b\n"
	        "    return a\n",
	        "    return a\n",
	    ),
	    (
	        'def title_case(s):\n'
	        '    """Return s with the first letter of each whitespace-separated word uppercased\n'
	        '    and the rest lowercased.\n'
	        '    title_case(\'hELLO wORLD\') -> \'Hello World\'.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate('hELLO wORLD') == 'Hello World'\n"
	        "    assert candidate('') == ''\n"
	        "    assert candidate('a') == 'A'\n"
	        "    assert candidate('the QUICK brown') == 'The Quick Brown'\n",
	        "title_case",
	        "    return ' '.join(w[:1].upper() + w[1:].lower() for w in s.split())\n",
	        "    return s.upper()\n",
	    ),
	    (
	        'def dedupe_preserve_order(items):\n'
	        '    """Return a list with duplicates removed, keeping the FIRST occurrence order.\n'
	        '    dedupe_preserve_order([3, 1, 3, 2, 1]) -> [3, 1, 2].\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate([3, 1, 3, 2, 1]) == [3, 1, 2]\n"
	        "    assert candidate([]) == []\n"
	        "    assert candidate([1, 1, 1]) == [1]\n"
	        "    assert candidate(['a', 'b', 'a']) == ['a', 'b']\n",
	        "dedupe_preserve_order",
	        "    seen = set()\n"
	        "    out = []\n"
	        "    for x in items:\n"
	        "        if x not in seen:\n"
	        "            seen.add(x)\n"
	        "            out.append(x)\n"
	        "    return out\n",
	        "    return list(set(items))\n",
	    ),
	    (
	        'def roman_to_int(s):\n'
	        '    """Convert a Roman numeral string (I, V, X, L, C, D, M; valid, uppercase) to an int.\n'
	        '    roman_to_int(\'IV\') -> 4; roman_to_int(\'XIV\') -> 14; roman_to_int(\'MCMXCIV\') -> 1994.\n'
	        '    """\n',
	        "def check(candidate):\n"
	        "    assert candidate('IV') == 4\n"
	        "    assert candidate('XIV') == 14\n"
	        "    assert candidate('MCMXCIV') == 1994\n"
	        "    assert candidate('III') == 3\n"
	        "    assert candidate('LVIII') == 58\n",
	        "roman_to_int",
	        "    vals = {'I':1,'V':5,'X':10,'L':50,'C':100,'D':500,'M':1000}\n"
	        "    total = 0\n"
	        "    prev = 0\n"
	        "    for c in reversed(s):\n"
	        "        v = vals[c]\n"
	        "        if v < prev:\n"
	        "            total -= v\n"
	        "        else:\n"
	        "            total += v\n"
	        "        prev = v\n"
	        "    return total\n",
	        "    return sum({'I':1,'V':5,'X':10,'L':50,'C':100,'D':500,'M':1000}[c] for c in s)\n",
	    ),
	]


	def main() -> int:
	    """Score each problem's reference bodies, then write the rows to OUT.

	    For every PROBLEMS entry, spec_rl.fraction_passing is called once with the
	    known-good body and once with the deliberately-wrong body. An entry is
	    accepted only when the good body scores exactly 1.0 and the bad body
	    scores below 1.0. One status line is printed per entry.

	    Returns:
	        1 if any entry was rejected (the rejections are printed to stderr and
	        OUT is not written); otherwise 0, after writing one JSON object per
	        line to OUT.
	    """
	    records = []
	    rejected = []
	    for index, entry in enumerate(PROBLEMS):
	        prompt, test, ep, good, bad = entry
	        problem = {"prompt": prompt, "test": test, "entry_point": ep}
	        good_score = spec_rl.fraction_passing(problem, good)
	        bad_score = spec_rl.fraction_passing(problem, bad)
	        if (good_score == 1.0) and (bad_score < 1.0):
	            status = "OK"
	        else:
	            status = "BAD"
	        print(f"[{status}] {ep:24} good={good_score:.3f} bad={bad_score:.3f}")
	        if status == "BAD":
	            rejected.append((ep, good_score, bad_score))
	        # Rows are collected even for rejected entries; they are only written
	        # out when nothing was rejected.
	        record = dict(problem)
	        record["task_id"] = f"adaption_{index}"
	        records.append(record)

	    if rejected:
	        print("VALIDATION FAILURES:", rejected, file=sys.stderr)
	        return 1

	    OUT.parent.mkdir(parents=True, exist_ok=True)
	    with open(OUT, "w") as f:
	        f.writelines(json.dumps(record) + "\n" for record in records)
	    print(f"\nWROTE {len(records)} validated rows -> {OUT}")
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    sys.exit(main())