0x960 / train /benchmark_eval.py
qtzx06's picture
feat: finalize swarm tooling and submission artifacts
eac9d9f
"""Benchmark two Chess960 eval functions against each other."""
from __future__ import annotations
import argparse
import importlib.util
import math
import random
from collections.abc import Callable
from dataclasses import asdict, dataclass
from pathlib import Path
import chess
from zero960.engine.match import play_game
EvalFn = Callable[[chess.Board], int]
@dataclass(slots=True)
class BenchmarkResult:
candidate_path: Path
baseline_path: Path
positions: int
depth: int
max_plies: int
seed: int
wins: int
draws: int
losses: int
points: float
total_games: int
score: float
elo_delta_estimate: float
def to_json(self) -> dict[str, object]:
payload = asdict(self)
payload["candidate_path"] = str(self.candidate_path)
payload["baseline_path"] = str(self.baseline_path)
return payload
def _load_eval(path: Path) -> EvalFn:
spec = importlib.util.spec_from_file_location(f"zero960_benchmark_{path.stem}", path)
if spec is None or spec.loader is None:
raise RuntimeError(f"failed to load module from {path}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
evaluate = getattr(module, "evaluate", None)
if evaluate is None or not callable(evaluate):
raise RuntimeError(f"{path} does not define evaluate(board)")
return evaluate
def _sample_positions(count: int, seed: int) -> list[int]:
rng = random.Random(seed)
population = list(range(960))
if count <= len(population):
return rng.sample(population, count)
return [rng.choice(population) for _ in range(count)]
def _elo_from_score(score: float) -> float:
clipped = min(max(score, 0.01), 0.99)
return -400.0 * math.log10((1.0 / clipped) - 1.0)
def benchmark_eval_files(
candidate_path: Path,
baseline_path: Path,
*,
positions: int = 64,
depth: int = 2,
max_plies: int = 120,
seed: int = 42,
) -> BenchmarkResult:
candidate_eval = _load_eval(candidate_path)
baseline_eval = _load_eval(baseline_path)
start_positions = _sample_positions(positions, seed)
wins = 0
draws = 0
losses = 0
points = 0.0
for chess960_index in start_positions:
white_result = play_game(
chess960_index,
candidate_eval,
baseline_eval,
depth=depth,
max_plies=max_plies,
)
points += white_result
if white_result == 1.0:
wins += 1
elif white_result == 0.5:
draws += 1
else:
losses += 1
black_result = 1.0 - play_game(
chess960_index,
baseline_eval,
candidate_eval,
depth=depth,
max_plies=max_plies,
)
points += black_result
if black_result == 1.0:
wins += 1
elif black_result == 0.5:
draws += 1
else:
losses += 1
total_games = len(start_positions) * 2
score = points / total_games if total_games else 0.0
return BenchmarkResult(
candidate_path=candidate_path,
baseline_path=baseline_path,
positions=len(start_positions),
depth=depth,
max_plies=max_plies,
seed=seed,
wins=wins,
draws=draws,
losses=losses,
points=points,
total_games=total_games,
score=score,
elo_delta_estimate=_elo_from_score(score),
)
def parse_args() -> argparse.Namespace:
root = Path(__file__).resolve().parents[1]
parser = argparse.ArgumentParser(description="Benchmark two Chess960 eval functions.")
parser.add_argument(
"--candidate-file",
default=str(root / "src/zero960/workspace_template/eval.py"),
help="Path to the candidate eval.py file.",
)
parser.add_argument(
"--baseline-file",
default=str(root / "src/zero960/engine/default_eval.py"),
help="Path to the baseline eval.py file.",
)
parser.add_argument("--positions", type=int, default=64)
parser.add_argument("--depth", type=int, default=2)
parser.add_argument("--max-plies", type=int, default=120)
parser.add_argument("--seed", type=int, default=42)
return parser.parse_args()
def main() -> None:
args = parse_args()
candidate_path = Path(args.candidate_file).resolve()
baseline_path = Path(args.baseline_file).resolve()
result = benchmark_eval_files(
candidate_path,
baseline_path,
positions=args.positions,
depth=args.depth,
max_plies=args.max_plies,
seed=args.seed,
)
print(f"candidate: {candidate_path}")
print(f"baseline: {baseline_path}")
print(
f"positions={result.positions} depth={result.depth} max_plies={result.max_plies} "
f"games={result.total_games} seed={result.seed}"
)
print(
f"record={result.wins}-{result.draws}-{result.losses} "
f"points={result.points:.1f}/{result.total_games}"
)
print(f"score={result.score:.3f} elo_delta_estimate={result.elo_delta_estimate:.1f}")
if __name__ == "__main__":
main()