affine-win-13 / evaluation /eval /tools /code_verifier_utils.py

updated models (2026-01-06)

ebe2ddc verified 2 months ago

21.2 kB

	import ast
	import json
	import sys
	import faulthandler
	import platform
	import re
	# used for debugging to time steps
	from datetime import datetime
	import os
	# to run the solution files we're using a timing based approach
	import signal
	import time
	import numpy as np

	from io import StringIO

	# used for testing the code that reads from input
	from unittest.mock import patch, mock_open

	# from pyext import RuntimeModule
	from types import ModuleType

	from enum import Enum
	from decimal import Decimal
	import time
	from typing import List

	import_string = "import sys\nsys.setrecursionlimit(1 << 25)\nsys.set_int_max_str_digits(1 << 25)\nimport time\nimport re\nimport bisect\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"

	def convert_python_literal_to_json(s):
	obj = ast.literal_eval(s)
	return json.dumps(obj)

	def convert_python_literal(arg_str):
	s = arg_str.strip()
	# If the whole string starts with '[' and ends with ']', assume it's one literal
	# and force it to be treated as a single-element tuple by adding a trailing comma.
	if s.startswith('[') and s.endswith(']'):
	wrapped_str = f"({arg_str},)"
	else:
	wrapped_str = f"({arg_str})"

	try:
	parsed_tuple = ast.literal_eval(wrapped_str)
	except Exception as e:
	raise ValueError(f"Error parsing argument string: {arg_str}") from e
	return list(parsed_tuple)

	def post_process_code(code):
	code = code.split("</code>")[0]
	code = code.replace("```python", "")
	code = code.split("```")[0]
	code = code.replace("<code>", "")
	return code

	def extract_functions(code_str):
	tree = ast.parse(code_str)
	extracted = []

	# Iterate over top-level nodes in the module.
	for node in tree.body:
	# Check for import statements.
	if isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign)):
	code_segment = ast.get_source_segment(code_str, node)
	extracted.append(code_segment)
	# Check for top-level function definitions.
	elif isinstance(node, ast.FunctionDef):
	code_segment = ast.get_source_segment(code_str, node)
	extracted.append(code_segment)
	# Check for class definitions.
	elif isinstance(node, ast.ClassDef):
	code_segment = ast.get_source_segment(code_str, node)
	extracted.append(code_segment)

	return extracted

	def has_code(response):
	pattern = r"```python(?:[a-zA-Z0-9])\n(.?)```"
	matches = re.findall(pattern, response, re.DOTALL)
	return matches

	def truncatefn(s, length=300):
	if isinstance(s, str):
	pass
	else:
	s = str(s)
	if len(s) <= length:
	return s

	return s[: length // 2] + "...(truncated) ..." + s[-length // 2 :]


	class CODE_TYPE(Enum):
	call_based = 0
	standard_input = 1


	# stuff for setting up signal timer
	class TimeoutException(Exception):
	pass


	def timeout_handler(signum, frame):
	print("timeout occured: alarm went off")
	raise TimeoutException


	# used to capture stdout as a list
	# from https://stackoverflow.com/a/16571630/6416660
	# alternative use redirect_stdout() from contextlib
	class Capturing(list):
	def __enter__(self):
	self._stdout = sys.stdout
	sys.stdout = self._stringio = StringIO()
	# Make closing the StringIO a no-op
	self._stringio.close = lambda x: 1
	return self

	def __exit__(self, *args):
	self.append(self._stringio.getvalue())
	del self._stringio # free up some memory
	sys.stdout = self._stdout


	def clean_if_name(code: str) -> str:
	try:
	astree = ast.parse(code)
	last_block = astree.body[-1]
	if isinstance(last_block, ast.If):
	condition = last_block.test
	if ast.unparse(condition).strip() == "__name__ == '__main__'":
	code = (
	ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body) # type: ignore
	)
	except:
	pass

	return code


	def make_function(code: str) -> str:
	try:
	import_stmts = []
	all_other_stmts = []
	astree = ast.parse(code)
	for stmt in astree.body:
	if isinstance(stmt, (ast.Import, ast.ImportFrom)):
	import_stmts.append(stmt)
	else:
	all_other_stmts.append(stmt)

	function_ast = ast.FunctionDef(
	name="wrapped_function",
	args=ast.arguments(
	posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]
	),
	body=all_other_stmts,
	decorator_list=[],
	lineno=-1,
	)
	main_code = (
	import_string
	+ "\n"
	+ ast.unparse(import_stmts) # type: ignore
	+ "\n"
	+ ast.unparse(function_ast) # type: ignore
	)
	return main_code
	except Exception as e:
	return code


	def call_method(method, inputs):

	if isinstance(inputs, list):
	inputs = "\n".join(inputs)

	inputs_line_iterator = iter(inputs.split("\n"))

	@patch("builtins.open", mock_open(read_data=inputs))
	@patch("sys.stdin", StringIO(inputs))
	@patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
	@patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
	@patch("sys.stdin.read", lambda *args: inputs)
	def _inner_call_method(_method):
	try:
	return _method()
	except SystemExit as e:
	pass
	finally:
	pass

	return _inner_call_method(method)


	def get_function(compiled_sol, fn_name: str): # type: ignore
	try:
	assert hasattr(compiled_sol, fn_name)
	return getattr(compiled_sol, fn_name)
	except Exception as e:
	return


	def compile_code(code: str, timeout: int):
	signal.alarm(timeout)
	try:
	tmp_sol = ModuleType("tmp_sol", "")
	exec(code, tmp_sol.__dict__)
	if "class Solution" in code:
	# leetcode wraps solutions in `Solution`
	# this is a hack to check if it is leetcode solution or not
	# currently livecodebench only supports LeetCode but
	# else condition allows future extensibility to other platforms
	compiled_sol = tmp_sol.Solution()
	else:
	# do nothing in the other case since function is accesible
	compiled_sol = tmp_sol

	assert compiled_sol is not None
	finally:
	signal.alarm(0)

	return compiled_sol


	# def convert_line_to_decimals(line: str) -> tuple[bool, List[Float]]:
	def convert_line_to_decimals(line: str):
	try:
	decimal_line = [float(elem) for elem in line.split()]
	except:
	return False, []
	return True, decimal_line


	def get_stripped_lines(val: str):
	## you don't want empty lines to add empty list after splitlines!
	val = val.strip()

	return [val_line.strip() for val_line in val.split("\n")]


	def grade_call_based(
	code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int
	):

	# call-based clean up logic
	# need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
	base_code = import_string + "\n\n" + code + '\n\n'

	if "class Solution" in code:
	base_code += 'def run_main_code(self):\n'
	else:
	base_code += 'def run_main_code():\n'

	all_inputs = [
	inputs.replace("\n", ",") for inputs in all_inputs
	]

	triggered = False

	try:
	all_outputs = [json.loads(output) for output in all_outputs]
	triggered = True
	except Exception:
	pass

	if not triggered:
	try:
	all_outputs = [
	json.loads(convert_python_literal_to_json(outputs)) for outputs in all_outputs
	]
	triggered = True
	except Exception:
	pass

	if not triggered:
	all_outputs = [
	convert_python_literal(outputs) for outputs in all_outputs
	]

	total_execution = 0
	all_results = []
	for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
	code = base_code
	if 'class Solution' in code:
	code += f" return self.{fn_name}({gt_inp})\nSolution.run_main_code = run_main_code\n"
	else:
	code += f" return {fn_name}({gt_inp})\n"

	compiled_sol = compile_code(code, timeout)
	if compiled_sol is None:
	return

	method = get_function(compiled_sol, "run_main_code")

	if method is None:
	return

	signal.alarm(timeout)
	faulthandler.enable()
	try:
	# can lock here so time is useful
	start = time.time()
	prediction = method()
	total_execution += time.time() - start
	signal.alarm(0)

	# don't penalize model if it produces tuples instead of lists
	# ground truth sequences are not tuples
	if isinstance(prediction, tuple):
	prediction = list(prediction)

	tmp_result = prediction == gt_out

	try:
	tmp_result = tmp_result or (json.dumps(prediction) == json.dumps(gt_out))
	except Exception:
	pass

	try:
	tmp_result = tmp_result or (np.allclose(float(prediction), float(gt_out)))
	except Exception:
	pass

	if isinstance(prediction, list):
	try:
	if len(prediction) == len(gt_out):
	all_matched = True
	for i in range(len(prediction)):
	if np.allclose(float(prediction[i]), float(gt_out[i])) == False:
	all_matched = False
	break
	tmp_result = tmp.result or all_matched
	except Exception:
	pass


	# handle floating point comparisons

	all_results.append(tmp_result)

	if not tmp_result:
	return all_results, {
	"inputs": truncatefn(gt_inp),
	"output": truncatefn(prediction),
	"expected": truncatefn(gt_out),
	"error_code": -2,
	"error_message": "Wrong Answer",
	}
	except Exception as e:
	signal.alarm(0)
	if "timeoutexception" in repr(e).lower():
	all_results.append(-3)
	return all_results, {
	"error": repr(e),
	"error_code": -3,
	"error_message": "Time Limit Exceeded",
	"inputs": truncatefn(gt_inp),
	"expected": truncatefn(gt_out),
	}
	else:
	all_results.append(-4)
	return all_results, {
	"error": repr(e),
	"error_code": -4,
	"error_message": "Runtime Error",
	"inputs": truncatefn(gt_inp),
	"expected": truncatefn(gt_out),
	}

	finally:
	signal.alarm(0)
	faulthandler.disable()

	return all_results, {"execution time": total_execution}


	def grade_stdio(
	code: str,
	all_inputs: list,
	all_outputs: list,
	timeout: int,
	):
	## runtime doesn't interact well with __name__ == '__main__'
	code = clean_if_name(code)

	## we wrap the given code inside another function
	code = make_function(code)

	compiled_sol = compile_code(code, timeout)

	if compiled_sol is None:
	return

	method = get_function(compiled_sol, "wrapped_function")

	if method is None:
	return

	all_results = []
	total_execution_time = 0
	for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
	signal.alarm(timeout)
	faulthandler.enable()

	signal.alarm(timeout)
	with Capturing() as captured_output:
	try:
	start = time.time()
	call_method(method, gt_inp)
	total_execution_time += time.time() - start
	# reset the alarm
	signal.alarm(0)
	except Exception as e:
	signal.alarm(0)
	if "timeoutexception" in repr(e).lower():
	all_results.append(-3)
	return all_results, {
	"error": repr(e),
	"error_code": -3,
	"error_message": "Time Limit Exceeded",
	"inputs": truncatefn(gt_inp),
	"expected": truncatefn(gt_out),
	}
	else:
	all_results.append(-4)
	return all_results, {
	"error": repr(e),
	"error_code": -4,
	"error_message": "Runtime Error",
	"inputs": truncatefn(gt_inp),
	"expected": truncatefn(gt_out),
	}

	finally:
	signal.alarm(0)
	faulthandler.disable()

	prediction = captured_output[0]

	stripped_prediction_lines = get_stripped_lines(prediction)
	stripped_gt_out_lines = get_stripped_lines(gt_out)

	## WA happens in multiple circumstances
	## so cache the return to make it clean!
	WA_send_args = {
	"inputs": truncatefn(gt_inp),
	"output": truncatefn(prediction),
	"expected": truncatefn(gt_out),
	"error_code": -2,
	}

	if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
	all_results.append(-2)
	WA_send_args["error_message"] = "Wrong answer: mismatched output length"
	return all_results, WA_send_args

	for output_line_idx, (
	stripped_prediction_line,
	stripped_gt_out_line,
	) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
	WA_send_args["error_message"] = (
	f"Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}"
	)
	## CASE 1: exact match
	if stripped_prediction_line == stripped_gt_out_line:
	continue

	## CASE 2: element-wise comparision
	## if there are floating elements
	## use `decimal` library for good floating point comparision
	## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
	## note that we should always be able to convert to decimals

	success, decimal_prediction_line = convert_line_to_decimals(
	stripped_prediction_line
	)

	if not success:
	all_results.append(-2)
	return all_results, WA_send_args

	success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)

	if not success:
	all_results.append(-2)
	return all_results, WA_send_args

	if decimal_prediction_line == decimal_gtout_line:
	continue

	ok = False
	try:
	if len(decimal_prediction_line) == len(decimal_gtout_line) and np.allclose(decimal_prediction_line, decimal_gtout_line):
	ok = True
	except Exception:
	pass

	if ok:
	continue
	all_results.append(-2)
	return all_results, WA_send_args
	all_results.append(True)

	return all_results, {"execution time": total_execution_time}


	def run_test(problem_to_check, debug=False, timeout=6):
	"""
	if test(generated_code) is not None it'll try to run the code.
	otherwise it'll just return an input and output pair.
	"""
	signal.signal(signal.SIGALRM, timeout_handler)
	# Disable functionalities that can make destructive changes to the test.
	# max memory is set to 4GB
	reliability_guard()

	generation = problem_to_check['generation']

	generation = has_code(generation)

	if len(generation) >= 1 and generation[0] == 'Your code\n':
	generation = generation[1:]

	if len(generation) == 0:
	return [-1], "No code found."

	if debug:
	print(f"start = {datetime.now().time()}")

	if problem_to_check['starter_code'] == '':
	which_type = CODE_TYPE.standard_input # Standard input
	method_name = None
	generation = generation[-1]
	else:
	starter = problem_to_check['starter_code']
	which_type = CODE_TYPE.call_based # Call-based
	method_name = problem_to_check['starter_code']
	generation = "\n".join(extract_functions('\n'.join(generation)))

	generation = post_process_code(generation).replace("\t", " ").replace('if __name__ == "__main__":', 'if True:')

	inputs, outputs = [], []
	for in_out in problem_to_check['input_output']:
	if os.path.exists(in_out['input']): # File based
	with open(in_out['input'], 'r', encoding='utf-8') as f:
	inputs.append(f.read().strip())
	with open(in_out['output'], 'r', encoding='utf-8') as f:
	outputs.append(f.read().strip())
	else: # plain-text based
	inputs.append(in_out['input'])
	outputs.append(in_out['output'])

	assert(len(inputs) == len(outputs))

	if debug:
	print(f"loaded {len(inputs)} Input-Output pairs = {datetime.now().time()}")

	results = []
	if debug:
	print(f"loading test code = {datetime.now().time()}")

	if which_type == CODE_TYPE.call_based:
	signal.alarm(timeout)
	try:
	results, metadata = grade_call_based(
	code=generation,
	all_inputs=inputs,
	all_outputs=outputs,
	fn_name=method_name,
	timeout=timeout,
	)
	return results, metadata
	except Exception as e:
	return [-4], f"Call-based Error as {e}"
	finally:
	signal.alarm(0)

	elif which_type == CODE_TYPE.standard_input:
	signal.alarm(timeout)
	try:
	results, metadata = grade_stdio(
	code=generation,
	all_inputs=inputs,
	all_outputs=outputs,
	timeout=timeout,
	)
	return results, metadata
	except Exception as e:
	return [-4], f"Stdin-based Error as {e}"
	finally:
	signal.alarm(0)


	def reliability_guard(maximum_memory_bytes=None):
	"""
	This disables various destructive functions and prevents the generated code
	from interfering with the test (e.g. fork bomb, killing other processes,
	removing filesystem files, etc.)
	WARNING
	This function is NOT a security sandbox. Untrusted code, including, model-
	generated code, should not be blindly executed outside of one. See the
	Codex paper for more information about OpenAI's code sandbox, and proceed
	with caution.
	"""

	if maximum_memory_bytes is not None:
	import resource

	resource.setrlimit(
	resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
	)
	resource.setrlimit(
	resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
	)
	if not platform.uname().system == "Darwin":
	resource.setrlimit(
	resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
	)

	faulthandler.disable()

	import builtins

	# builtins.exit = None
	builtins.quit = None

	import os

	os.environ["OMP_NUM_THREADS"] = "1"

	os.kill = None
	os.system = None
	os.putenv = None
	os.remove = None
	os.removedirs = None
	os.rmdir = None
	os.fchdir = None
	os.setuid = None
	os.fork = None
	os.forkpty = None
	os.killpg = None
	os.rename = None
	os.renames = None
	os.truncate = None
	os.replace = None
	os.unlink = None
	os.fchmod = None
	os.fchown = None
	os.chmod = None
	os.chown = None
	os.chroot = None
	os.fchdir = None
	os.lchflags = None
	os.lchmod = None
	os.lchown = None
	os.getcwd = None
	os.chdir = None

	import shutil

	shutil.rmtree = None
	shutil.move = None
	shutil.chown = None

	import subprocess

	subprocess.Popen = None # type: ignore

	__builtins__["help"] = None

	import sys

	sys.modules["ipdb"] = None
	sys.modules["joblib"] = None
	sys.modules["resource"] = None
	sys.modules["psutil"] = None
	sys.modules["tkinter"] = None