Spaces:

igriv
/

idealpolyhedra

Running on CPU Upgrade

App Files Files Community

idealpolyhedra / tests /llm_benchmark_test.py

igriv

Fix critical bug: remove i from fixed vertices (should be 0, 1, ∞ only)

8750b11 3 months ago

raw

history blame contribute delete

10 kB

	#!/usr/bin/env python3
	"""
	LLM Benchmark Test for Ideal Polyhedron Volume Computation

	This creates a test to expose LLM confusion about ideal polyhedron volumes:
	- GPT-4.5: Claims maximum 9-vertex volume is ~9.13
	- Gemini 2.5 Deep Think: Claims to construct volume > 10
	- Reality: the best 9-vertex volume found is approximately 8.16 (with 0, 1, i fixed)

	Test structure:
	1. One optimal 9-vertex configuration (volume ≈ 8.16, above the 8.0 threshold)
	2. Nine random 9-vertex configurations (expected volume < 8.0)
	3. Threshold test: volume > 8.0 should be True only for optimal

	This test can be used to evaluate whether an LLM can correctly:
	- Compute ideal polyhedron volumes
	- Distinguish optimal from random configurations
	- Understand the geometric constraints
	"""

	import numpy as np
	import json
	import os
	import sys
	from datetime import datetime

	# Add parent directory to path
	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

	from ideal_poly_volume_toolkit.volume_threshold import get_volume


	def generate_random_9vertex_config(seed):
	    """
	    Generate a reproducible random 9-vertex configuration.

	    The configuration consists of the three fixed vertices 0, 1 and i,
	    followed by five pseudo-random points of modulus below 1.8, followed
	    by the point at infinity (stored explicitly as np.inf).

	    Args:
	        seed: Seed for the random generator; the same seed always yields
	            the same configuration.

	    Returns:
	        List of 9 vertices: eight complex numbers, then np.inf.
	    """
	    # A private RandomState produces the same stream as np.random.seed(seed)
	    # followed by np.random.uniform(...) calls, but does not reseed the
	    # global NumPy generator behind the caller's back.
	    rng = np.random.RandomState(seed)

	    # Fixed vertices
	    fixed = [0+0j, 1+0j, 1j]

	    # Five points sampled in polar form: radius in [0, 1.8), angle in
	    # [0, 2*pi).  The draw order (radius, then angle) is kept so existing
	    # seeds keep reproducing the same points.
	    random_vertices = []
	    for _ in range(5):
	        r = rng.uniform(0, 1.8)
	        theta = rng.uniform(0, 2 * np.pi)
	        random_vertices.append(r * np.exp(1j * theta))

	    # Infinity is appended explicitly as the last vertex.
	    return fixed + random_vertices + [np.inf]


	def create_llm_benchmark_data(optimal_config_file=None, threshold=8.0):
	    """
	    Create the LLM benchmark test data.

	    Builds one "optimal" configuration (loaded from optimal_config_file when
	    that file exists, otherwise a hard-coded placeholder) and nine seeded
	    random configurations (seeds 1000-1008), each paired with the volume
	    reported by get_volume.

	    Args:
	        optimal_config_file: Path to JSON file with optimal 9-vertex config.
	            If None, or if the file does not exist, a placeholder is used.
	        threshold: Volume threshold recorded in the metadata and used to
	            derive the optimal configuration's expected result.

	    Returns:
	        Dictionary with test data, holding the keys "metadata",
	        "test_instructions", "optimal_configuration",
	        "random_configurations" and "summary".
	    """
	    def to_json(vertex):
	        """Serialize one finite vertex as a {"real", "imag"} dict."""
	        return {"real": float(vertex.real), "imag": float(vertex.imag)}

	    # Normalize so the threshold renders the same way in every message.
	    threshold = float(threshold)

	    # Load optimal configuration if available
	    if optimal_config_file and os.path.exists(optimal_config_file):
	        with open(optimal_config_file, 'r', encoding='utf-8') as f:
	            optimal_data = json.load(f)

	        stored = optimal_data['optimal_configuration']
	        optimal_finite = [
	            complex(v['real'], v['imag'])
	            for v in stored['vertices']['fixed'] + stored['vertices']['free']
	        ]
	        optimal_vertices = optimal_finite + [np.inf]
	        optimal_volume = stored['volume']
	    else:
	        print("Warning: Optimal configuration file not found. Using placeholder.")
	        # Placeholder - will be replaced once optimization completes.
	        # Finite vertices come first and infinity last, matching every other
	        # vertex list in this file; otherwise the fixed/free slices below
	        # would serialize infinity and drop the final finite vertex.
	        optimal_finite = [0+0j, 1+0j, 1j,
	                          0.5+0.5j, -0.3+0.7j, 0.8-0.2j,
	                          -0.5-0.3j, 0.2+0.9j]
	        optimal_vertices = optimal_finite + [np.inf]
	        optimal_volume = get_volume(optimal_vertices)

	    # Generate 9 random configurations
	    random_configs = []
	    for i in range(9):
	        seed = 1000 + i
	        vertices = generate_random_9vertex_config(seed=seed)
	        random_configs.append({
	            "id": i + 1,
	            "seed": seed,
	            "vertices": {
	                "fixed": [to_json(v) for v in vertices[:3]],
	                # Everything between the fixed vertices and the trailing ∞
	                "free": [to_json(v) for v in vertices[3:-1]],
	                "infinity": "implicit"
	            },
	            "volume": float(get_volume(vertices))
	        })

	    random_volumes = [cfg['volume'] for cfg in random_configs]

	    # Compile test data
	    test_data = {
	        "metadata": {
	            "description": "LLM Benchmark: Ideal Polyhedron Volume Computation",
	            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	            "test_type": "volume_threshold_classification",
	            "threshold": threshold,
	            "context": {
	                "gpt4.5_claim": "Maximum 9-vertex volume is approximately 9.13",
	                "gemini2.5_claim": "Can construct 9-vertex polyhedron with volume > 10",
	                "reality": "Optimal 9-vertex volume found is approximately 8.16 (with standard constraints: 0, 1, i fixed)"
	            }
	        },
	        "test_instructions": {
	            "task": f"For each configuration, determine if the ideal polyhedron volume exceeds {threshold}",
	            "expected_result": "Only the optimal configuration should return True",
	            "format": f"Use volume_exceeds_threshold(vertices, {threshold}) from volume_threshold module"
	        },
	        "optimal_configuration": {
	            "id": 0,
	            "description": f"Optimal 9-vertex configuration (should return True for threshold {threshold})",
	            "volume": float(optimal_volume),
	            "vertices": {
	                "fixed": [to_json(v) for v in optimal_finite[:3]],
	                "free": [to_json(v) for v in optimal_finite[3:]],
	                "infinity": "implicit"
	            },
	            # bool() keeps the value JSON-serializable even when the
	            # comparison yields a NumPy boolean.
	            "expected_result": bool(optimal_volume > threshold)
	        },
	        "random_configurations": random_configs,
	        "summary": {
	            "total_configs": 1 + len(random_configs),
	            "optimal_count": 1,
	            "random_count": len(random_configs),
	            "threshold": threshold,
	            "optimal_volume": float(optimal_volume),
	            "random_volumes": {
	                "min": float(min(random_volumes)),
	                "max": float(max(random_volumes)),
	                "mean": float(np.mean(random_volumes))
	            }
	        }
	    }

	    return test_data


	def run_benchmark_test(test_data):
	    """
	    Recompute the volume of every configuration and check it against the threshold.

	    The optimal configuration is compared with its stored expected result;
	    each random configuration is expected to stay at or below the threshold.
	    Progress and a summary are printed along the way.

	    Args:
	        test_data: Dictionary from create_llm_benchmark_data()

	    Returns:
	        Dictionary with a per-configuration "results" list and a "summary"
	        of total, passed and failed counts plus the success rate.
	    """
	    def rebuild(vertex_spec):
	        # Fixed vertices first, then free ones, then the explicit point at infinity.
	        finite = [
	            complex(point['real'], point['imag'])
	            for group in ('fixed', 'free')
	            for point in vertex_spec[group]
	        ]
	        return finite + [np.inf]

	    limit = test_data['metadata']['threshold']
	    banner = "=" * 70
	    outcomes = []

	    print(banner)
	    print("Running LLM Benchmark Test")
	    print(banner)
	    print(f"Threshold: {limit}")
	    print()

	    # --- optimal configuration ---
	    print("Testing optimal configuration...")
	    optimal = test_data['optimal_configuration']
	    measured = get_volume(rebuild(optimal['vertices']))
	    above = measured > limit

	    print(f" Volume: {measured:.6f}")
	    print(f" Exceeds {limit}? {above}")
	    wanted = optimal['expected_result']
	    print(f" Expected: {wanted}")
	    if above == wanted:
	        print(" Status: ✓ PASS")
	    else:
	        print(" Status: ✗ FAIL")
	    print()

	    outcomes.append({
	        "id": 0,
	        "type": "optimal",
	        "volume": float(measured),
	        "exceeds_threshold": bool(above),
	        "expected": bool(wanted),
	        "passed": bool(above == wanted)
	    })

	    # --- random configurations: none of them should exceed the threshold ---
	    print("Testing random configurations...")
	    for entry in test_data['random_configurations']:
	        measured = get_volume(rebuild(entry['vertices']))
	        above = measured > limit
	        wanted = False

	        if above == wanted:
	            mark = "✓ PASS"
	        else:
	            mark = "✗ FAIL"
	        print(f" Config {entry['id']}: volume = {measured:.6f}, exceeds = {above} {mark}")

	        outcomes.append({
	            "id": entry['id'],
	            "type": "random",
	            "seed": entry['seed'],
	            "volume": float(measured),
	            "exceeds_threshold": bool(above),
	            "expected": bool(wanted),
	            "passed": bool(above == wanted)
	        })

	    # --- summary ---
	    total = len(outcomes)
	    passed = len([item for item in outcomes if item['passed']])

	    print()
	    print(banner)
	    print("Test Summary")
	    print(banner)
	    print(f"Passed: {passed}/{total}")
	    print(f"Success rate: {100 * passed / total:.1f}%")
	    print()

	    summary = {
	        "total_tests": total,
	        "passed": passed,
	        "failed": total - passed,
	        "success_rate": float(passed / total)
	    }
	    return {"results": outcomes, "summary": summary}


	def main():
	    """Build the benchmark data, evaluate it, and write both JSON files under tests/."""
	    def write_json(payload, path):
	        # Pretty-printed so the generated files are easy to inspect by hand.
	        with open(path, 'w') as handle:
	            json.dump(payload, handle, indent=2)

	    rule = "=" * 70
	    print("\n" + rule)
	    print("LLM BENCHMARK TEST GENERATOR")
	    print(rule)
	    print()

	    # Optimal configuration produced by the optimizer, if it exists
	    optimal_config_file = "results/data/9vertex_optimal_for_llm_test.json"

	    # Build the benchmark data set
	    print("Generating benchmark data...")
	    test_data = create_llm_benchmark_data(optimal_config_file)

	    # Persist the benchmark data
	    output_file = "tests/llm_benchmark_9vertex.json"
	    os.makedirs("tests", exist_ok=True)
	    write_json(test_data, output_file)

	    volume_range = test_data['summary']['random_volumes']
	    print(f"✓ Benchmark data saved to: {output_file}")
	    print()
	    print(f"Optimal volume: {test_data['optimal_configuration']['volume']:.6f}")
	    print(f"Random volumes: {volume_range['min']:.6f} - {volume_range['max']:.6f}")
	    print()

	    # Evaluate every configuration against the threshold
	    test_results = run_benchmark_test(test_data)

	    # Persist the evaluation results
	    results_file = "tests/llm_benchmark_9vertex_results.json"
	    write_json(test_results, results_file)

	    print(f"✓ Test results saved to: {results_file}")
	    print()


	# Generate the benchmark and run it only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()