Spaces:

qpluslab
/

OpenRA-Bench

Running

App Files Files Community

OpenRA-Bench / openra_bench /pairwise.py

yxc20098

Pairwise adversarial eval + Elo (the user's 'pairwise conditions')

b04adfc about 2 months ago

Raw

History Blame Contribute Delete

3.72 kB

	"""Pairwise adversarial evaluation + Elo.

	The project goal includes evaluating models "on pairwise adversarial
	conditions", not only fixed-scenario absolute scores. True co-located
	1v1 (both sides model-driven in one map) is the deeper engine feature
	(task #3); this module delivers the tractable, immediately-useful form:
	comparative pairwise ranking — every model plays the same
	controlled scenarios, and models are ranked by head-to-head composite
	on shared cells via Elo. Pure + deterministic (no engine in
	`pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it
	to the live evaluator.
	"""

	from __future__ import annotations

	import itertools
	from pathlib import Path
	from typing import Callable

	from .run_eval import AgentFactory, evaluate

	# Per model: {cell -> mean composite over that cell's seeds}.
	ModelCellScores = dict[str, dict[str, float]]

	ELO_K = 32.0
	ELO_START = 1000.0
	_TIE_EPS = 1e-6


	def _cell_means(stats: dict) -> dict[str, float]:
	"""Mean composite per cell from a run_eval stats dict (public split)."""
	by: dict[str, list[float]] = {}
	for e in stats.get("episodes", []):
	if e.get("split", "public") != "public":
	continue
	by.setdefault(e["cell"], []).append(e["composite"])
	return {c: sum(v) / len(v) for c, v in by.items() if v}


	def pairwise_elo(scores: ModelCellScores) -> dict:
	    """Compute a deterministic Elo ranking from per-cell composite scores.

	    Every unordered pair of models is treated as one match. A's match
	    score is the fraction of cells scored by both models on which A's
	    composite exceeds B's by more than ``_TIE_EPS``; cells whose composites
	    differ by at most ``_TIE_EPS`` count as half a win. Pairs with no
	    shared cell are skipped: no rating change and no ``matrix`` entry.

	    Ratings are updated sequentially, one update per pair, visiting pairs
	    in sorted-name order. Sequential Elo updates depend on the order in
	    which they are applied, so this fixed order is what makes the result
	    reproducible regardless of the insertion order of ``scores``.

	    Args:
	        scores: model name -> {cell -> composite}.

	    Returns:
	        A dict with four entries:

	        * ``"elo"``: final rating per model, rounded to 1 decimal.
	        * ``"rank"``: 1-based rank per model, highest (unrounded) rating
	          first, ties broken by model name.
	        * ``"matrix"``: ``matrix[a][b]`` is A's match score against B,
	          rounded to 4 decimals.
	        * ``"shared_cells"``: ``"<a>|<b>"`` -> number of cells both models
	          have, for every pair (including pairs with none).
	    """
	    models = sorted(scores)
	    elo = {m: ELO_START for m in models}
	    matrix: dict[str, dict[str, float]] = {m: {} for m in models}
	    shared_cells: dict[str, int] = {}

	    for a, b in itertools.combinations(models, 2):
	        cells_a, cells_b = scores[a], scores[b]
	        # Iteration order of the shared set is irrelevant: the tally below
	        # only adds 1.0 or 0.5, which sum exactly in any order.
	        shared = set(cells_a) & set(cells_b)
	        # Plain "|" separator between the two (sorted) model names.
	        shared_cells[f"{a}|{b}"] = len(shared)
	        if not shared:
	            continue

	        wins = 0.0
	        for cell in shared:
	            delta = cells_a[cell] - cells_b[cell]
	            if delta > _TIE_EPS:
	                wins += 1.0
	            elif abs(delta) <= _TIE_EPS:
	                wins += 0.5
	        score_a = wins / len(shared)  # A's match score in [0, 1]
	        matrix[a][b] = round(score_a, 4)
	        matrix[b][a] = round(1.0 - score_a, 4)

	        # Standard Elo expected score for A from the current rating gap.
	        expected_a = 1.0 / (1.0 + 10.0 ** ((elo[b] - elo[a]) / 400.0))
	        elo[a] += ELO_K * (score_a - expected_a)
	        # B's actual and expected scores are the complements of A's.
	        elo[b] += ELO_K * ((1.0 - score_a) - (1.0 - expected_a))

	    ranked = sorted(models, key=lambda m: (-elo[m], m))
	    return {
	        "elo": {m: round(elo[m], 1) for m in models},
	        "rank": {m: i + 1 for i, m in enumerate(ranked)},
	        "matrix": matrix,  # matrix[a][b] = A's match score vs B
	        "shared_cells": shared_cells,
	    }


	def run_pairwise(
	    packs: list[Path],
	    levels: list[str],
	    seeds: list[int],
	    agents: dict[str, AgentFactory],
	) -> dict:
	    """Evaluate every named agent on identical inputs and rank them pairwise.

	    Each agent factory is passed to ``evaluate`` with the same ``packs``,
	    ``levels`` and ``seeds``; the resulting stats are reduced to per-cell
	    means and fed to ``pairwise_elo``.

	    Returns:
	        A dict with ``"pairwise"`` (the ``pairwise_elo`` result),
	        ``"cell_scores"`` (model name -> per-cell means) and
	        ``"per_model_overall"`` (model name -> the ``"overall"`` entry of
	        that model's stats, or ``{}`` when absent).

	    Raises:
	        ValueError: if fewer than two agents are supplied.
	    """
	    model_count = len(agents)
	    if model_count < 2:
	        raise ValueError("pairwise eval needs >= 2 models")

	    stats_by_model: dict[str, dict] = {}
	    cell_scores: ModelCellScores = {}
	    for model_name, make_agent in agents.items():
	        # Reduce each model's stats right after its run, before the next model starts.
	        model_stats = evaluate(packs, levels, seeds, agent_factory=make_agent)
	        stats_by_model[model_name] = model_stats
	        cell_scores[model_name] = _cell_means(model_stats)

	    result: dict = {
	        "pairwise": pairwise_elo(cell_scores),
	        "cell_scores": cell_scores,
	    }
	    overall: dict[str, dict] = {}
	    for model_name, model_stats in stats_by_model.items():
	        overall[model_name] = model_stats.get("overall", {})
	    result["per_model_overall"] = overall
	    return result