Add LLM benchmark care package for testing volume predictions
a509947
#!/usr/bin/env python3
"""
LLM Benchmark Driver: Test volume threshold predictions.
This script loads test configurations and checks which ones exceed
the volume threshold. Designed to expose LLM confusion about volumes.
Usage:
python run_benchmark.py
"""
import json
import sys
import os
import numpy as np
# Add the repository root (two levels up, '../..') to sys.path so the toolkit import below resolves
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from ideal_poly_volume_toolkit.geometry import ideal_poly_volume_via_delaunay
def load_test_suite(filepath="test_data/llm_benchmark_test_suite.json"):
"""Load the test suite from JSON file."""
with open(filepath, 'r') as f:
return json.load(f)
def run_test(config, threshold):
"""
Run a single test case.
Args:
config: Configuration dict with vertices and expected results
threshold: Volume threshold to test against
Returns:
dict: Test results
"""
# Extract vertices and convert to complex numpy array
vertices = np.array([complex(v['real'], v['imag']) for v in config['vertices']])
# Compute volume using Bloch-Wigner (most accurate)
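    # (The Bloch-Wigner function D(z) = Im(Li2(z)) + arg(1 - z) * log|z| gives
    # the hyperbolic volume of an ideal tetrahedron with cross-ratio z; the
    # toolkit presumably sums it over a Delaunay triangulation of the vertices.)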
actual_volume = ideal_poly_volume_via_delaunay(vertices, use_bloch_wigner=True)
# Check if volume exceeds threshold
exceeds = actual_volume > threshold
# Expected result
expected = config['should_exceed_threshold']
return {
"id": config['id'],
"description": config['description'],
"actual_volume": actual_volume,
"expected_volume": config.get('expected_volume', None),
"threshold": threshold,
"exceeds_threshold": exceeds,
"expected_to_exceed": expected,
"passed": exceeds == expected
}
def main():
print("=" * 70)
print("LLM BENCHMARK: Ideal Polyhedron Volume Test")
print("=" * 70)
# Load test suite
print("\nLoading test suite...")
test_suite = load_test_suite()
threshold = test_suite['metadata']['threshold']
n_configs = test_suite['metadata']['n_configurations']
print(f" Threshold: {threshold}")
print(f" Configurations: {n_configs}")
# Run tests
print(f"\n{'=' * 70}")
print("Running tests...")
print(f"{'=' * 70}\n")
results = []
for config in test_suite['configurations']:
result = run_test(config, threshold)
results.append(result)
# Print result
status = "✓ PASS" if result['passed'] else "✗ FAIL"
exceeds_str = "EXCEEDS" if result['exceeds_threshold'] else "below"
print(f"{status} | {result['id']:15s} | Volume: {result['actual_volume']:.6f} | {exceeds_str} threshold")
# Summary
print(f"\n{'=' * 70}")
print("SUMMARY")
print(f"{'=' * 70}")
n_passed = sum(1 for r in results if r['passed'])
n_failed = sum(1 for r in results if not r['passed'])
print(f"Total tests: {len(results)}")
print(f"Passed: {n_passed}")
print(f"Failed: {n_failed}")
if n_failed == 0:
print("\n🎉 All tests passed!")
else:
print(f"\n⚠️ {n_failed} test(s) failed!")
# Detailed results
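    # (Assumed suite convention: configurations[0] is the optimal case and the
    # remainder are random comparison cases, matching the labels printed below.)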
    optimal_result = results[0]
    random_results = results[1:]
    print("\nOptimal configuration:")
    print(f" Volume: {optimal_result['actual_volume']:.8f}")
    print(f" Exceeds {threshold}: {optimal_result['exceeds_threshold']}")
    if random_results:
        print("\nRandom configurations:")
        print(f" Max volume: {max(r['actual_volume'] for r in random_results):.8f}")
        print(f" Min volume: {min(r['actual_volume'] for r in random_results):.8f}")
        print(f" Any exceed {threshold}: {any(r['exceeds_threshold'] for r in random_results)}")
return 0 if n_failed == 0 else 1
if __name__ == "__main__":
sys.exit(main())