JustinTX
/

sky2

Model card Files Files and versions

sky2 / benchmarks /math /sums_diffs_finite_sets /evaluator /evaluator.py

JustinTX's picture

Add files using upload-large-folder tool

b0e88cf verified about 1 month ago

history blame contribute delete

3.17 kB

	# ===--------------------------------------------------------------------------------------===#
	#
	# This file implements the evaluator for the sums and differences of finite sets problem.
	#
	# ===--------------------------------------------------------------------------------------===#
	#
	# Some of the code in this file is adapted from:
	#
	# google-deepmind/alphaevolve_results:
	# Licensed under the Apache License v2.0.
	#
	# ===--------------------------------------------------------------------------------------===#

	import sys
	import os
	from importlib import __import__
	import time
	import numpy as np

	# Reference C6 lower bound that submissions are compared against. It is
	# printed as the "Known best bound (AlphaEvolve)" by verify_c6_solution and
	# used by evaluate() as the divisor that normalizes "combined_score".
	BENCHMARK = 1.158417281556896


	def verify_c6_solution(u_set: np.ndarray, c6_achieved: float):
	    """Verify a reported C6 lower bound against the set U that produced it.

	    The bound is recomputed from scratch as

	        C6 = 1 + log(|U - U| / |U + U|) / log(2 * max(U) + 1)

	    and compared with ``c6_achieved`` using ``np.isclose``.

	    Args:
	        u_set: 1D NumPy array with an integer dtype. It must contain 0, must
	            not contain negative values, and must contain at least one
	            positive value.
	        c6_achieved: The C6 value reported by the candidate program.

	    Returns:
	        None. A short report is printed to stdout on success.

	    Raises:
	        ValueError: If ``u_set`` violates any constraint above, holds values
	            too large to sum in 64-bit integers, or the recomputed bound does
	            not match ``c6_achieved``.
	    """
	    if not isinstance(u_set, np.ndarray) or u_set.ndim != 1:
	        raise ValueError("Solution U must be a 1D numpy array of integers.")
	    # Enforce the integer requirement for real. With a float dtype a set can
	    # be scaled towards zero, which keeps |U+U| and |U-U| unchanged but
	    # shrinks log(2*max+1) and inflates the bound without limit.
	    if not np.issubdtype(u_set.dtype, np.integer):
	        raise ValueError("Solution U must be a 1D numpy array of integers.")

	    # Verify constraints
	    if 0 not in u_set:
	        raise ValueError("Set U must contain 0.")
	    if np.any(u_set < 0):
	        raise ValueError("Set U must contain non-negative integers.")

	    # Work with a Python int so 2 * max + 1 cannot wrap in a narrow dtype.
	    max_u = int(np.max(u_set))
	    if max_u == 0:
	        # U == {0} makes the denominator log(1) == 0, i.e. an undefined bound.
	        raise ValueError("Set U must contain at least one positive integer.")
	    if max_u > np.iinfo(np.int64).max // 2:
	        raise ValueError("Set U contains values too large to verify without overflow.")

	    # Widen before the pairwise arithmetic: int8/uint8/... inputs would
	    # otherwise wrap around and corrupt the sum and difference sets.
	    u_wide = u_set.astype(np.int64)

	    # Re-calculate the C6 bound using NumPy
	    u_plus_u = np.unique(u_wide[:, None] + u_wide[None, :])
	    u_minus_u = np.unique(u_wide[:, None] - u_wide[None, :])

	    ratio = len(u_minus_u) / len(u_plus_u)
	    computed_c6 = 1 + np.log(ratio) / np.log(2 * max_u + 1)

	    # Check for consistency
	    if not np.isclose(computed_c6, c6_achieved):
	        raise ValueError(f"C6 mismatch: reported {c6_achieved:.6f}, computed {computed_c6:.6f}")

	    print(f"C6 lower bound achieved: {c6_achieved:.6f}")
	    print(f"Known best bound (AlphaEvolve): {BENCHMARK}")

	    if c6_achieved > BENCHMARK:
	        print("Successfully found a new, better lower bound!")
	    else:
	        print("Result is not better than the known lower bounds.")


	def evaluate(program_path: str):
	    """Run a candidate program and score the C6 bound it reports.

	    The file at ``program_path`` is loaded as a module, its ``run()`` function
	    is called and expected to return ``(u_set, c6_bound)``, and the result is
	    checked with ``verify_c6_solution``.

	    Args:
	        program_path: Path to the Python file that defines ``run()``.

	    Returns:
	        On success, a dict with ``c6_bound``, ``combined_score`` (the bound
	        divided by ``BENCHMARK``), ``set_size``, ``max_val`` and ``eval_time``
	        (seconds spent inside ``run()`` only). If any ``Exception`` is raised
	        while loading, running or verifying, a dict with
	        ``combined_score`` 0.0 and the exception text under ``error``.
	    """
	    import importlib.util

	    try:
	        abs_program_path = os.path.abspath(program_path)
	        program_dir = os.path.dirname(abs_program_path)
	        module_name = os.path.splitext(os.path.basename(program_path))[0]

	        # Load from the explicit file path instead of importing by name: a
	        # name-based import is answered from sys.modules, so a second program
	        # with the same file name (or one named like an already imported
	        # module) would silently reuse the wrong module.
	        spec = importlib.util.spec_from_file_location(module_name, abs_program_path)
	        if spec is None or spec.loader is None:
	            raise ImportError(f"Cannot load program from {abs_program_path}")
	        program = importlib.util.module_from_spec(spec)

	        not_registered = object()
	        previous_module = sys.modules.get(module_name, not_registered)
	        try:
	            # The program's directory stays importable so it can pull in
	            # sibling files, and the module is registered under its own name
	            # while it runs so code that looks itself up in sys.modules works.
	            sys.path.insert(0, program_dir)
	            sys.modules[module_name] = program
	            spec.loader.exec_module(program)
	            start_time = time.time()
	            u_set, c6_bound = program.run()
	            end_time = time.time()
	            eval_time = end_time - start_time
	        finally:
	            # Undo every global change so later evaluations start clean.
	            if previous_module is not_registered:
	                sys.modules.pop(module_name, None)
	            else:
	                sys.modules[module_name] = previous_module
	            if program_dir in sys.path:
	                sys.path.remove(program_dir)

	        verify_c6_solution(u_set, c6_bound)

	        return {
	            "c6_bound": float(c6_bound),
	            "combined_score": float(c6_bound) / BENCHMARK,
	            "set_size": len(u_set),
	            "max_val": int(np.max(u_set)),
	            "eval_time": float(eval_time),
	        }
	    except Exception as e:
	        return {"combined_score": 0.0, "error": str(e)}


	if __name__ == "__main__":
	    # Script entry point kept for backwards compatibility: the wrapper
	    # bridges the dict returned by the old-style evaluate() to the container
	    # JSON protocol. wrapper.py is a copy of
	    # skydiscover/evaluation/wrapper.py.
	    import wrapper

	    wrapper.run(evaluate)