Spaces:

lainwired
/

jaxaht-benchmark

Running

App Files Files Community

jaxaht-benchmark / evaluation /eval_api.py

lainwired

Initial jaxaht-benchmark deployment

5146e76 10 days ago

raw

history blame contribute delete

23.4 kB

	"""Hanabi AHT evaluation API. Upload checkpoint, get scores vs held-out partners."""
	import argparse
	import json
	import os
	import shutil
	import sys
	import tempfile
	import time
	import uuid
	import threading
	import zipfile

	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	import jax
	import jax.numpy as jnp
	import numpy as np
	from flask import Flask, jsonify, request, send_from_directory
	from flask_cors import CORS

	jax.config.update('jax_platform_name', 'cpu')

	from envs import make_env
	from envs.log_wrapper import LogWrapper
	from common.run_episodes import run_episodes
	from agents.hanabi.bc_lstm_agent import BCLSTMAgent
	from agents.hanabi.agent_policy_wrappers import (
	HanabiBCLSTMPolicyWrapper,
	HanabiIGGIPolicyWrapper,
	HanabiPiersPolicyWrapper,
	HanabiSmartBotPolicyWrapper,
	HanabiRandomPolicyWrapper,
	HanabiOBLPolicyWrapper,
	HanabiRuleBasedPolicyWrapper,
	HanabiFlawedPolicyWrapper,
	HanabiOuterPolicyWrapper,
	HanabiVanDenBerghPolicyWrapper,
	)

	# Flask application object; every route in this module is registered on it.
	app = Flask(__name__)
	# 200 MB upload limit
	app.config.update(MAX_CONTENT_LENGTH=200 * 1024 * 1024)
	# Enable cross-origin requests using the flask_cors defaults.
	CORS(app)


	# Evaluation defaults used by the HTTP endpoints.
	DEFAULT_NUM_EPISODES = 64
	MAX_EPISODE_STEPS = 128

	# The leaderboard JSON file is stored next to this module.
	LEADERBOARD_FILE = os.path.join(
	    os.path.dirname(os.path.abspath(__file__)),
	    'leaderboard.json',
	)

	# Game dimensions forwarded to the policy wrappers.
	HAND_SIZE = 5
	NUM_COLORS = 5
	NUM_RANKS = 5
	NUM_ACTIONS = 21

	# Keyword arguments shared by the rule-based partner factories.
	_RB_KW = {
	    'hand_size': HAND_SIZE,
	    'num_colors': NUM_COLORS,
	    'num_ranks': NUM_RANKS,
	    'num_actions': NUM_ACTIONS,
	    'using_log_wrapper': True,
	}

	def _obl_path(level, seed='a'):
	    """Build the relative path of the OBL weight file for a belief level.

	    Levels above 1 carry an extra ``_LOAD1`` marker in the file name.
	    """
	    load_tag = ''
	    if level > 1:
	        load_tag = '_LOAD1'
	    directory = f'agents/hanabi/obl-r2d2-flax/icml_OBL{level}'
	    filename = f'OFF_BELIEF1_SHUFFLE_COLOR0{load_tag}_BZA0_BELIEF_{seed}.safetensors'
	    return f'{directory}/{filename}'

	# Registry of evaluation partners, keyed by the identifier clients send in
	# the ``partner`` field. Each entry has a display name, a description and a
	# zero-argument factory that builds the policy wrapper on demand.
	PARTNER_REGISTRY = {
	    # Baselines
	    'random': {
	        'name': 'Random',
	        'description': 'Uniform random legal action baseline (0/25)',
	        'factory': lambda: HanabiRandomPolicyWrapper(num_actions=NUM_ACTIONS, using_log_wrapper=True),
	    },
	    # Rule-based strategy variants (action-category priority agents)
	    'rule_based_cautious': {
	        'name': 'RuleBased (cautious)',
	        'description': 'Priority: play first, then discard, then hint',
	        'factory': lambda: HanabiRuleBasedPolicyWrapper(strategy='cautious', **_RB_KW),
	    },
	    'rule_based_aggressive': {
	        'name': 'RuleBased (aggressive)',
	        'description': 'Priority: play only, fallback to hint',
	        'factory': lambda: HanabiRuleBasedPolicyWrapper(strategy='aggressive', **_RB_KW),
	    },
	    'rule_based_communicative': {
	        'name': 'RuleBased (communicative)',
	        'description': 'Priority: hint first, then play',
	        'factory': lambda: HanabiRuleBasedPolicyWrapper(strategy='communicative', **_RB_KW),
	    },
	    'rule_based_frugal': {
	        'name': 'RuleBased (frugal)',
	        'description': 'Priority: discard first to save info tokens',
	        'factory': lambda: HanabiRuleBasedPolicyWrapper(strategy='frugal', **_RB_KW),
	    },
	    # Walton-Rivers agents
	    'iggi': {
	        'name': 'IGGI',
	        'description': 'Game-state-aware rule-based (Walton-Rivers 2017). 11.7/25 self-play.',
	        'factory': lambda: HanabiIGGIPolicyWrapper(**_RB_KW),
	    },
	    'piers': {
	        'name': 'Piers',
	        'description': 'IGGI + probabilistic play (>60%) + dispensable hints. 11.6/25.',
	        'factory': lambda: HanabiPiersPolicyWrapper(**_RB_KW),
	    },
	    'van_den_bergh': {
	        'name': 'Van Den Bergh',
	        'description': 'Probabilistic discard ordering to protect valuable cards. 9.3/25.',
	        'factory': lambda: HanabiVanDenBerghPolicyWrapper(**_RB_KW),
	    },
	    'outer': {
	        'name': 'Outer',
	        'description': 'Hint-heavy agent. Scores 0/25 on full Hanabi (exhausts tokens).',
	        'factory': lambda: HanabiOuterPolicyWrapper(**_RB_KW),
	    },
	    'flawed': {
	        'name': 'Flawed (30%)',
	        'description': 'IGGI with 30% random-mistake probability. 0.2/25.',
	        'factory': lambda: HanabiFlawedPolicyWrapper(mistake_prob=0.3, **_RB_KW),
	    },
	    # This entry was previously registered under the empty string, which
	    # made it unselectable wherever an empty partner value is treated as
	    # "not provided" (e.g. the leaderboard partner filter).
	    # NOTE(review): 'flawed_60' is chosen to parallel 'flawed'; confirm the
	    # intended key and migrate any stored leaderboard rows that use ''.
	    'flawed_60': {
	        'name': 'Flawed (60%)',
	        'description': 'IGGI with 60% random-mistake probability. ~0/25.',
	        'factory': lambda: HanabiFlawedPolicyWrapper(mistake_prob=0.6, **_RB_KW),
	    },
	    # SmartBot (strongest rule-based)
	    'smartbot': {
	        'name': 'SmartBot',
	        'description': 'Convention-heavy rule-based agent. 18.0/25 self-play, 0.4 vs human proxy.',
	        'factory': lambda: HanabiSmartBotPolicyWrapper(**_RB_KW),
	    },
	    # OBL R2D2 pretrained, all 5 belief levels (seed-a)
	    'obl_l1': {
	        'name': 'OBL-L1',
	        'description': 'Off-Belief Learning level 1 (Hu et al. 2021). 20.9/25 self-play.',
	        'factory': lambda: HanabiOBLPolicyWrapper(weight_file=_obl_path(1), using_log_wrapper=True),
	    },
	    'obl_l2': {
	        'name': 'OBL-L2',
	        'description': 'Off-Belief Learning level 2.',
	        'factory': lambda: HanabiOBLPolicyWrapper(weight_file=_obl_path(2), using_log_wrapper=True),
	    },
	    'obl_l3': {
	        'name': 'OBL-L3',
	        'description': 'Off-Belief Learning level 3.',
	        'factory': lambda: HanabiOBLPolicyWrapper(weight_file=_obl_path(3), using_log_wrapper=True),
	    },
	    'obl_l4': {
	        'name': 'OBL-L4',
	        'description': 'Off-Belief Learning level 4. 24.3/25 self-play (near-optimal).',
	        'factory': lambda: HanabiOBLPolicyWrapper(weight_file=_obl_path(4), using_log_wrapper=True),
	    },
	    'obl_l5': {
	        'name': 'OBL-L5',
	        'description': 'Off-Belief Learning level 5.',
	        'factory': lambda: HanabiOBLPolicyWrapper(weight_file=_obl_path(5), using_log_wrapper=True),
	    },
	    # Human proxy
	    'bc_lstm': {
	        'name': 'BC-LSTM Human Proxy',
	        'description': 'Behavioral cloning LSTM trained on 1K human games (AH2AC2). 45.5% accuracy.',
	        'factory': lambda: HanabiBCLSTMPolicyWrapper(
	            weight_file='agents/hanabi/bc_lstm_weights/bc_2p.safetensors',
	            using_log_wrapper=True, greedy=True),
	    },
	}

	# Trained ego agents (RL checkpoints) are loaded as (policy, params) pairs
	# rather than a bare policy, so they are kept apart from PARTNER_REGISTRY.
	# This dict memoises those pairs; see _load_trained_ego for the key layout.
	_ego_cache = dict()

	# Location of the bundled IPPO checkpoint, resolved relative to this module.
	IPPO_FIXTURE = os.path.join(
	    os.path.dirname(os.path.abspath(__file__)),
	    'fixtures/hanabi_ippo_1e9_seed0/saved_train_run',
	)

	def _cpu_safe_restore(path):
	    """Restore an orbax checkpoint, placing every leaf on the first CPU device.

	    Fixtures saved on GPU machines encode GPU sharding, which orbax rejects
	    when restoring on a CPU-only host such as the eval API machine. This
	    bypasses common.save_load_utils.load_train_run, which relies on implicit
	    sharding.

	    Relative paths are resolved against ``REPO_PATH``. NumPy arrays in the
	    restored tree are converted to JAX arrays; other leaves are returned
	    unchanged.
	    """
	    import orbax.checkpoint as ocp

	    if not os.path.isabs(path):
	        from common.save_load_utils import REPO_PATH
	        path = os.path.join(REPO_PATH, path)

	    checkpointer = ocp.PyTreeCheckpointer()
	    cpu_device = jax.devices('cpu')[0]
	    cpu_sharding = jax.sharding.SingleDeviceSharding(cpu_device)

	    def _restore_arg_for(_leaf_meta):
	        # Every leaf gets the same explicit CPU placement.
	        return ocp.ArrayRestoreArgs(sharding=cpu_sharding)

	    def _to_jax(leaf):
	        # Convert to jax arrays
	        if isinstance(leaf, np.ndarray):
	            return jnp.array(leaf)
	        return leaf

	    tree_meta = checkpointer.metadata(path)
	    per_leaf_args = jax.tree.map(_restore_arg_for, tree_meta)
	    restored = checkpointer.restore(path, restore_args=per_leaf_args)
	    return jax.tree_util.tree_map(_to_jax, restored)


	def _load_trained_ego(fixture_path, actor_type='mlp', hidden_dim=512,
	                      activation='relu', idx_list=None):
	    """Load a trained ego policy and its parameters from an orbax checkpoint.

	    Results are memoised in ``_ego_cache`` under the key
	    ``(fixture_path, actor_type, hidden_dim, activation, idx tuple or None)``.

	    Args:
	        fixture_path: Checkpoint directory passed to ``_cpu_safe_restore``.
	        actor_type: Only part of the cache key; not used to build the policy.
	        hidden_dim: Only part of the cache key; not used to build the policy.
	        activation: Forwarded to ``MLPActorCriticPolicy``.
	        idx_list: Optional sequence of checkpoint indices. Only the first
	            entry is used; ``None`` or an empty sequence selects index 0.
	            An index past the end selects the last checkpoint.

	    Returns:
	        A ``(policy, params)`` tuple.
	    """
	    cache_key = (fixture_path, actor_type, hidden_dim, activation,
	                 tuple(idx_list) if idx_list else None)
	    if cache_key in _ego_cache:
	        return _ego_cache[cache_key]

	    from evaluation.heldout_evaluator import extract_params
	    from agents.mlp_actor_critic_agent import MLPActorCriticPolicy

	    # NOTE(review): env is not used below; kept in case constructing it has
	    # side effects that the rest of the load depends on -- confirm and remove.
	    env = make_env('hanabi')
	    env = LogWrapper(env)

	    # Use CPU-safe restore instead of the default loader
	    restored = _cpu_safe_restore(fixture_path)
	    ckpts = restored['checkpoints']

	    # Build policy directly. JaxMARL Hanabi's observation_space.shape is a
	    # bare int (658), not a tuple, which breaks initialize_mlp_agent's
	    # `.shape[0]` access. Hardcode obs_dim=658 for full Hanabi.
	    obs_dim = 658
	    action_dim = 21
	    policy = MLPActorCriticPolicy(
	        action_dim=action_dim,
	        obs_dim=obs_dim,
	        activation=activation,
	    )
	    init_params = policy.init_params(jax.random.PRNGKey(0))

	    # Extract the requested checkpoint. Default to the first. An empty
	    # idx_list previously slipped past the `is not None` check and raised
	    # IndexError on idx[0]; treat it the same as None, matching cache_key.
	    idx = idx_list if idx_list else [0]
	    params_list, _ = extract_params(ckpts, init_params, idx_labels=None)
	    params = params_list[idx[0]] if idx[0] < len(params_list) else params_list[-1]

	    _ego_cache[cache_key] = (policy, params)
	    return policy, params


	def _extract_and_find_checkpoint(zip_path, extract_dir):
	    """Unpack a zipped orbax checkpoint and locate the checkpoint directory.

	    The archive is rejected when its declared uncompressed size exceeds
	    500 MB or when a member name is absolute or contains a ``..`` component.
	    After extraction the tree is searched for a ``saved_train_run``
	    directory; if none exists, the first directory holding an orbax marker
	    file (``_CHECKPOINT_METADATA`` or ``_METADATA``) is used instead.

	    Returns:
	        ``(checkpoint_dir, config)`` where ``config`` is the parsed
	        ``config.yaml`` as a plain container, or ``None`` when no config was
	        found or it could not be loaded.

	    Raises:
	        ValueError: If the archive is unsafe, too large, or holds no
	            recognisable checkpoint.
	    """
	    # Reject zip bombs: cap at 500 MB uncompressed
	    size_cap = 500 * 1024 * 1024
	    with zipfile.ZipFile(zip_path, 'r') as archive:
	        members = archive.infolist()
	        declared_total = 0
	        for member in members:
	            declared_total += member.file_size
	        if declared_total > size_cap:
	            raise ValueError(
	                f"Uncompressed size {declared_total} bytes exceeds {size_cap} limit"
	            )
	        # Reject absolute paths and traversal
	        for member in members:
	            member_name = member.filename
	            is_absolute = member_name.startswith('/')
	            has_parent_ref = '..' in member_name.split('/')
	            if is_absolute or has_parent_ref:
	                raise ValueError(f"Unsafe path in zip: {member_name}")
	        archive.extractall(extract_dir)

	    # Find saved_train_run directory anywhere in the extracted tree. The walk
	    # only stops early once both the directory and a config.yaml are known.
	    run_dir = None
	    yaml_path = None
	    for current, subdirs, filenames in os.walk(extract_dir):
	        if 'saved_train_run' in subdirs:
	            run_dir = os.path.join(current, 'saved_train_run')
	        if 'config.yaml' in filenames:
	            yaml_path = os.path.join(current, 'config.yaml')
	        if run_dir and yaml_path:
	            break

	    if run_dir is None:
	        # Maybe the zip IS the saved_train_run directory (user zipped its
	        # contents), so look for orbax markers instead.
	        orbax_markers = ('_CHECKPOINT_METADATA', '_METADATA')
	        run_dir = next(
	            (
	                current
	                for current, _subdirs, filenames in os.walk(extract_dir)
	                if any(marker in filenames for marker in orbax_markers)
	            ),
	            None,
	        )

	    if run_dir is None:
	        raise ValueError(
	            "Could not find saved_train_run/ directory or orbax checkpoint "
	            "in the uploaded zip. Expected structure: saved_train_run/ with "
	            "orbax files, optionally alongside config.yaml."
	        )

	    if not yaml_path:
	        return run_dir, None

	    # Config parsing is best-effort: any failure simply yields no config.
	    try:
	        from omegaconf import OmegaConf
	        parsed = OmegaConf.to_container(OmegaConf.load(yaml_path), resolve=True)
	    except Exception:
	        parsed = None
	    return run_dir, parsed

	# Ego registry: every PARTNER_REGISTRY agent can also play the ego role.
	# Those entries carry no learned params (the PARTNER_REGISTRY factory is
	# reused at dispatch time) and are flagged trained=False. Trained RL
	# checkpoints get their own entries with a fixture path.
	EGO_REGISTRY = dict(
	    (
	        partner_key,
	        {
	            'name': partner_info['name'],
	            'description': partner_info['description'],
	            'trained': False,
	        },
	    )
	    for partner_key, partner_info in PARTNER_REGISTRY.items()
	)

	# Trained RL checkpoints (orbax fixtures)
	EGO_REGISTRY.update({
	    'ippo_1e9': {
	        'name': 'IPPO (trained, 1e9)',
	        'description': '3-seed IPPO trained 1e9 steps. 15.78/25 self-play, 0-5/25 with unseen partners.',
	        'trained': True,
	        'fixture': IPPO_FIXTURE,
	    },
	})


	def evaluate_ego_vs_partner(ego_policy, ego_params, partner_policy,
	                            num_episodes=64, rng_seed=34957):
	    """Roll out ego (agent 0) with a partner (agent 1) and summarise scores.

	    The partner is run with no parameters (``agent_1_param=None``). Returns
	    reported by ``run_episodes`` are averaged over their last axis to give
	    one score per episode.
	    (NOTE(review): presumably the last axis indexes agents -- confirm.)

	    Returns:
	        Dict with ``mean``, ``median``, ``std``, ``min``, ``max`` (floats),
	        ``num_episodes`` (int) and ``per_episode`` (list of scores).
	    """
	    env = LogWrapper(make_env('hanabi'))

	    rollout_kwargs = dict(
	        agent_0_policy=ego_policy,
	        agent_0_param=ego_params,
	        agent_1_policy=partner_policy,
	        agent_1_param=None,
	        max_episode_steps=MAX_EPISODE_STEPS,
	        num_eps=num_episodes,
	        agent_0_test_mode=True,
	        agent_1_test_mode=False,
	    )
	    metrics = run_episodes(jax.random.PRNGKey(rng_seed), env, **rollout_kwargs)

	    episode_scores = np.array(metrics['returned_episode_returns']).mean(axis=-1)

	    reducers = (
	        ('mean', np.mean),
	        ('median', np.median),
	        ('std', np.std),
	        ('min', np.min),
	        ('max', np.max),
	    )
	    summary = {label: float(reduce_fn(episode_scores)) for label, reduce_fn in reducers}
	    summary['num_episodes'] = int(num_episodes)
	    summary['per_episode'] = episode_scores.tolist()
	    return summary



	# Guards writes to LEADERBOARD_FILE: held by add_leaderboard_entry for its
	# load/append/sort/save cycle and by clear_leaderboard while it resets the file.
	_leaderboard_lock = threading.Lock()


	def load_leaderboard():
	    """Return the entries stored in LEADERBOARD_FILE, or [] if it is absent."""
	    if not os.path.exists(LEADERBOARD_FILE):
	        return []
	    with open(LEADERBOARD_FILE) as handle:
	        stored = json.load(handle)
	    return stored


	def save_leaderboard(entries):
	    """Atomically write ``entries`` to LEADERBOARD_FILE as indented JSON.

	    The data is first dumped to a uniquely named temporary file in the same
	    directory and then moved over the target with ``os.replace``. A reader
	    that does not hold the leaderboard lock therefore never sees a truncated
	    or half-written file, and a failed dump leaves the previous contents
	    intact.

	    Args:
	        entries: JSON-serialisable list of leaderboard rows.

	    Raises:
	        TypeError, ValueError, OSError: Propagated from ``json.dump`` or the
	            file operations; the existing leaderboard is left untouched.
	    """
	    tmp_path = f'{LEADERBOARD_FILE}.{uuid.uuid4().hex}.tmp'
	    try:
	        with open(tmp_path, 'w') as handle:
	            json.dump(entries, handle, indent=2)
	        os.replace(tmp_path, LEADERBOARD_FILE)
	    finally:
	        # After a successful replace the temp path no longer exists; this
	        # only removes the leftover from a failed write.
	        if os.path.exists(tmp_path):
	            os.remove(tmp_path)


	def add_leaderboard_entry(entry):
	    """Append ``entry``, re-rank by descending ``mean_score`` and persist.

	    The load/append/sort/save cycle runs under ``_leaderboard_lock``.
	    Entries without a ``mean_score`` are ranked as if they scored 0.

	    Returns:
	        The full, sorted list of entries including the new one.
	    """
	    def _negated_mean(row):
	        return -row.get('mean_score', 0)

	    with _leaderboard_lock:
	        ranked = load_leaderboard()
	        ranked.append(entry)
	        ranked.sort(key=_negated_mean)
	        save_leaderboard(ranked)
	    return ranked


	@app.route('/api/partners', methods=['GET'])
	def list_partners():
	    """Return each partner key mapped to its name and description as JSON."""
	    summary = {
	        partner_key: {
	            'name': partner['name'],
	            'description': partner['description'],
	        }
	        for partner_key, partner in PARTNER_REGISTRY.items()
	    }
	    return jsonify(summary)


	@app.route('/api/egos', methods=['GET'])
	def list_egos():
	    """Return each ego key mapped to its name, description and trained flag."""
	    exposed_fields = ('name', 'description', 'trained')
	    summary = {
	        ego_key: {field: ego[field] for field in exposed_fields}
	        for ego_key, ego in EGO_REGISTRY.items()
	    }
	    return jsonify(summary)


	@app.route('/api/evaluate_upload', methods=['POST'])
	def evaluate_upload():
	    """Evaluate an uploaded orbax checkpoint zip against a held-out partner.

	    Multipart form fields:
	        checkpoint: .zip of orbax checkpoint (must contain saved_train_run/
	            directory, optionally with config.yaml at the root)
	        agent_name: str (label for leaderboard)
	        partner: str (partner key from PARTNER_REGISTRY)
	        num_episodes: int >= 1 (default DEFAULT_NUM_EPISODES, capped at 1000)
	        actor_type: str (default 'mlp', overridden by config.yaml if present)
	        hidden_dim: int (default 512, overridden by config.yaml if present)
	        activation: str (default 'relu', overridden by config.yaml if present)

	    Responses:
	        200 with the leaderboard entry, per-episode scores and ranking.
	        400 for a missing/invalid upload, unknown partner, malformed numeric
	            field, or a checkpoint that cannot be loaded.
	        500 when the evaluation itself raises.
	    """
	    checkpoint_file = request.files.get('checkpoint')
	    if checkpoint_file is None:
	        return jsonify({'error': 'No checkpoint file uploaded'}), 400
	    # The filename can be missing on a malformed multipart part; treat that
	    # as "not a zip" instead of crashing on None.lower().
	    if not (checkpoint_file.filename or '').lower().endswith('.zip'):
	        return jsonify({'error': 'Checkpoint must be a .zip archive'}), 400

	    partner_key = request.form.get('partner', 'bc_lstm')
	    if partner_key not in PARTNER_REGISTRY:
	        return jsonify({'error': f'Unknown partner: {partner_key}'}), 400

	    agent_name = request.form.get('agent_name') or 'Uploaded checkpoint'

	    # Malformed numbers are a client error (400), not an unhandled 500.
	    try:
	        num_episodes = int(request.form.get('num_episodes', DEFAULT_NUM_EPISODES))
	        hidden_dim = int(request.form.get('hidden_dim', 512))
	    except (TypeError, ValueError):
	        return jsonify({'error': 'num_episodes and hidden_dim must be integers'}), 400
	    if num_episodes < 1:
	        return jsonify({'error': 'num_episodes must be at least 1'}), 400
	    num_episodes = min(num_episodes, 1000)

	    actor_type = request.form.get('actor_type', 'mlp')
	    activation = request.form.get('activation', 'relu')

	    # Extract to a temp directory
	    upload_dir = os.path.join(os.path.dirname(LEADERBOARD_FILE), 'uploads')
	    os.makedirs(upload_dir, exist_ok=True)
	    temp_zip = os.path.join(upload_dir, f'{uuid.uuid4().hex}.zip')
	    extract_dir = tempfile.mkdtemp(prefix='eval_upload_', dir=upload_dir)
	    saved_run_path = None
	    try:
	        checkpoint_file.save(temp_zip)

	        try:
	            saved_run_path, cfg = _extract_and_find_checkpoint(temp_zip, extract_dir)
	        except ValueError as e:
	            return jsonify({'error': f'Invalid checkpoint zip: {str(e)}'}), 400

	        # Pull architecture from config.yaml if present
	        if cfg and 'algorithm' in cfg:
	            algo = cfg['algorithm']
	            if isinstance(algo, dict):
	                actor_type = algo.get('ACTOR_TYPE', actor_type)
	                try:
	                    hidden_dim = int(algo.get('FC_HIDDEN_DIM', hidden_dim))
	                except (TypeError, ValueError):
	                    return jsonify({
	                        'error': 'Invalid FC_HIDDEN_DIM in config.yaml',
	                    }), 400
	                activation = algo.get('ACTIVATION', activation)

	        # Load the checkpoint
	        try:
	            ego_policy, ego_params = _load_trained_ego(
	                saved_run_path, actor_type=actor_type,
	                hidden_dim=hidden_dim, activation=activation,
	            )
	        except Exception as e:
	            return jsonify({
	                'error': f'Failed to load checkpoint: {str(e)}',
	                'hint': ('Verify actor_type, hidden_dim, and activation match the '
	                         'architecture the checkpoint was trained with. Include '
	                         'config.yaml in the zip to auto-detect.'),
	            }), 400

	        partner_policy = PARTNER_REGISTRY[partner_key]['factory']()

	        start_time = time.time()
	        try:
	            results = evaluate_ego_vs_partner(
	                ego_policy, ego_params, partner_policy,
	                num_episodes=num_episodes,
	            )
	        except Exception as e:
	            return jsonify({'error': f'Evaluation failed: {str(e)}'}), 500
	        elapsed = time.time() - start_time

	        submission_id = str(uuid.uuid4())[:8]
	        entry = {
	            'id': submission_id,
	            'agent_name': agent_name,
	            'partner': partner_key,
	            'partner_name': PARTNER_REGISTRY[partner_key]['name'],
	            'mean_score': results['mean'],
	            'median_score': results['median'],
	            'std': results['std'],
	            'min_score': results['min'],
	            'max_score': results['max'],
	            'num_episodes': results['num_episodes'],
	            'ego_type': 'uploaded',
	            'ego_name': f'Uploaded ({actor_type}, h={hidden_dim})',
	            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
	            'eval_time_seconds': round(elapsed, 1),
	        }
	        leaderboard = add_leaderboard_entry(entry)

	        response = {
	            **entry,
	            'per_episode_scores': results['per_episode'],
	            'leaderboard_position': next(
	                (i + 1 for i, e in enumerate(leaderboard) if e['id'] == submission_id),
	                len(leaderboard),
	            ),
	            'total_submissions': len(leaderboard),
	            'architecture': {'actor_type': actor_type, 'hidden_dim': hidden_dim,
	                             'activation': activation},
	        }
	        return jsonify(response)
	    finally:
	        # _load_trained_ego memoises by checkpoint path. Uploads live under a
	        # one-off temp path that is deleted below, so their cache entries can
	        # never be hit again and would only pin the params in memory.
	        if saved_run_path is not None:
	            for cache_key in list(_ego_cache):
	                if cache_key[0] == saved_run_path:
	                    _ego_cache.pop(cache_key, None)
	        # Clean up temp files
	        if os.path.exists(temp_zip):
	            os.remove(temp_zip)
	        if os.path.exists(extract_dir):
	            shutil.rmtree(extract_dir, ignore_errors=True)


	@app.route('/api/evaluate', methods=['POST'])
	def evaluate():
	    """Evaluate a registered ego agent against a registered partner.

	    JSON body fields (all optional):
	        agent_name: label stored on the leaderboard (default 'Anonymous')
	        partner: key in PARTNER_REGISTRY (default 'bc_lstm')
	        ego: key in EGO_REGISTRY (default 'iggi')
	        num_episodes: int >= 1 (default DEFAULT_NUM_EPISODES, capped at 1000)

	    Responses:
	        200 with the leaderboard entry, per-episode scores and ranking.
	        400 for a non-object body, unknown partner/ego or bad num_episodes.
	        404 when a trained ego's fixture path does not exist.
	        500 when loading the checkpoint or running the evaluation raises.
	    """
	    data = request.json or {}
	    # A JSON array or scalar has no .get(); reject it as a client error.
	    if not isinstance(data, dict):
	        return jsonify({'error': 'Request body must be a JSON object'}), 400

	    agent_name = data.get('agent_name', 'Anonymous')
	    partner_key = data.get('partner', 'bc_lstm')
	    ego_key = data.get('ego', 'iggi')

	    # Malformed numbers are a client error (400), not an unhandled 500.
	    try:
	        num_episodes = int(data.get('num_episodes', DEFAULT_NUM_EPISODES))
	    except (TypeError, ValueError):
	        return jsonify({'error': 'num_episodes must be an integer'}), 400
	    if num_episodes < 1:
	        return jsonify({'error': 'num_episodes must be at least 1'}), 400
	    num_episodes = min(num_episodes, 1000)

	    # The isinstance checks keep unhashable JSON values (lists, objects)
	    # from raising TypeError in the registry membership tests.
	    if not isinstance(partner_key, str) or partner_key not in PARTNER_REGISTRY:
	        return jsonify({'error': f'Unknown partner: {partner_key}',
	                        'available': list(PARTNER_REGISTRY.keys())}), 400

	    if not isinstance(ego_key, str) or ego_key not in EGO_REGISTRY:
	        return jsonify({'error': f'Unknown ego: {ego_key}',
	                        'available': list(EGO_REGISTRY.keys())}), 400

	    partner_policy = PARTNER_REGISTRY[partner_key]['factory']()

	    ego_info = EGO_REGISTRY[ego_key]
	    if ego_info['trained']:
	        fixture = ego_info['fixture']
	        if not os.path.exists(fixture):
	            return jsonify({'error': f'Checkpoint not found: {fixture}'}), 404
	        try:
	            ego_policy, ego_params = _load_trained_ego(fixture)
	        except Exception as e:
	            return jsonify({'error': f'Failed to load checkpoint: {str(e)}'}), 500
	    else:
	        # Untrained egos reuse the partner factory and carry no parameters.
	        if ego_key not in PARTNER_REGISTRY:
	            return jsonify({'error': f'No factory for ego: {ego_key}'}), 400
	        ego_policy = PARTNER_REGISTRY[ego_key]['factory']()
	        ego_params = None

	    start_time = time.time()
	    try:
	        results = evaluate_ego_vs_partner(
	            ego_policy, ego_params, partner_policy,
	            num_episodes=num_episodes
	        )
	    except Exception as e:
	        return jsonify({'error': f'Evaluation failed: {str(e)}'}), 500
	    elapsed = time.time() - start_time

	    submission_id = str(uuid.uuid4())[:8]
	    entry = {
	        'id': submission_id,
	        'agent_name': agent_name,
	        'partner': partner_key,
	        'partner_name': PARTNER_REGISTRY[partner_key]['name'],
	        'mean_score': results['mean'],
	        'median_score': results['median'],
	        'std': results['std'],
	        'min_score': results['min'],
	        'max_score': results['max'],
	        'num_episodes': results['num_episodes'],
	        'ego_type': ego_key,
	        'ego_name': EGO_REGISTRY[ego_key]['name'],
	        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
	        'eval_time_seconds': round(elapsed, 1),
	    }
	    leaderboard = add_leaderboard_entry(entry)

	    response = {
	        **entry,
	        'per_episode_scores': results['per_episode'],
	        'leaderboard_position': next(
	            (i + 1 for i, e in enumerate(leaderboard) if e['id'] == submission_id),
	            len(leaderboard)
	        ),
	        'total_submissions': len(leaderboard),
	    }
	    return jsonify(response)


	@app.route('/api/leaderboard', methods=['GET'])
	def leaderboard():
	    """Return stored leaderboard entries, optionally filtered by partner.

	    A non-empty ``partner`` query argument keeps only the entries whose
	    ``partner`` field equals it.
	    """
	    wanted_partner = request.args.get('partner')
	    rows = load_leaderboard()
	    if wanted_partner:
	        rows = [row for row in rows if row.get('partner') == wanted_partner]
	    payload = {'entries': rows, 'total': len(rows)}
	    return jsonify(payload)


	@app.route('/api/leaderboard/clear', methods=['POST'])
	def clear_leaderboard():
	    """Overwrite the leaderboard with an empty list and confirm as JSON."""
	    no_entries = []
	    with _leaderboard_lock:
	        save_leaderboard(no_entries)
	        confirmation = jsonify({'status': 'cleared'})
	    return confirmation


	@app.route('/api/info', methods=['GET'])
	def api_info():
	    """Describe the service: version, endpoints, partner keys and defaults.

	    The endpoint map covers every ``/api`` route registered in this module;
	    it previously omitted the upload, ego-listing, clear and info routes.
	    """
	    return jsonify({
	        'service': 'Hanabi AHT Policy Evaluation API',
	        'version': '1.0',
	        'endpoints': {
	            'POST /api/evaluate': 'Submit a policy for evaluation',
	            'POST /api/evaluate_upload': 'Upload a zipped orbax checkpoint for evaluation',
	            'GET /api/leaderboard': 'View submission rankings',
	            'POST /api/leaderboard/clear': 'Remove all leaderboard entries',
	            'GET /api/partners': 'List available evaluation partners',
	            'GET /api/egos': 'List available ego agents',
	            'GET /api/info': 'Describe this API',
	        },
	        'supported_partners': list(PARTNER_REGISTRY.keys()),
	        'default_num_episodes': DEFAULT_NUM_EPISODES,
	        # NOTE(review): this exposes a server-side filesystem path to
	        # clients; kept for backward compatibility -- confirm it is needed.
	        'leaderboard_file': LEADERBOARD_FILE,
	    })


	@app.route('/')
	def index():
	    """Serve ``index.html`` from the ``eval_ui`` directory beside this module."""
	    module_dir = os.path.dirname(os.path.abspath(__file__))
	    ui_dir = os.path.join(module_dir, 'eval_ui')
	    return send_from_directory(ui_dir, 'index.html')


	if __name__ == '__main__':
	    # Command-line entry point: parse options, print a banner, start Flask.
	    parser = argparse.ArgumentParser(description='Hanabi AHT Evaluation API')
	    parser.add_argument('--port', default=5001, type=int)
	    parser.add_argument('--host', default='0.0.0.0', type=str)
	    parser.add_argument('--num-episodes', default=64, type=int)
	    args = parser.parse_args()

	    # The CLI value replaces the module-level default read by the endpoints.
	    DEFAULT_NUM_EPISODES = args.num_episodes

	    banner = [
	        'Hanabi AHT Evaluation API',
	        f' http://{args.host}:{args.port}',
	        f' Partners: {", ".join(PARTNER_REGISTRY.keys())}',
	        f' Default episodes: {DEFAULT_NUM_EPISODES}',
	    ]
	    print('\n'.join(banner))
	    app.run(host=args.host, port=args.port, debug=False)