idealpolyhedra / tests /llm_benchmark_test.py
igriv's picture
Fix critical bug: remove i from fixed vertices (should be 0, 1, ∞ only)
8750b11
#!/usr/bin/env python3
"""
LLM Benchmark Test for Ideal Polyhedron Volume Computation
This creates a test to expose LLM confusion about ideal polyhedron volumes:
- GPT-4.5: Claims maximum 9-vertex volume is ~9.13
- Gemini 2.5 Deep Think: Claims to construct volume > 10
- Reality: Optimal 9-vertex volume found is approximately 8.16 (with 0, 1, i fixed)
Test structure:
1. One optimal 9-vertex configuration (volume ≈ 8.16)
2. Nine random 9-vertex configurations (expected volume < 8.0)
3. Threshold test: volume > 8.0 should be True only for optimal
This test can be used to evaluate whether an LLM can correctly:
- Compute ideal polyhedron volumes
- Distinguish optimal from random configurations
- Understand the geometric constraints
"""
import numpy as np
import json
import os
import sys
from datetime import datetime
# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ideal_poly_volume_toolkit.volume_threshold import get_volume
def generate_random_9vertex_config(seed):
    """
    Generate a reproducible random 9-vertex configuration.

    The configuration consists of the three fixed vertices 0, 1 and i,
    five pseudo-random points, and the point at infinity (np.inf) as the
    last entry.

    Args:
        seed: Seed for the pseudo-random generator. The same seed always
            yields the same configuration.

    Returns:
        List of 9 vertices: [0, 1, i, z1, ..., z5, np.inf].
    """
    # A private RandomState yields the same stream that
    # np.random.seed(seed) followed by np.random.uniform(...) would, but
    # does not reseed NumPy's global generator behind the caller's back.
    rng = np.random.RandomState(seed)

    # Fixed vertices
    fixed = [0+0j, 1+0j, 1j]

    # Five random points in the disk of radius 1.8 around the origin.
    # The radius itself is drawn uniformly, so points cluster toward the
    # centre. The draw order (radius, then angle) must not change, or
    # existing seeds would produce different configurations.
    random_vertices = []
    for _ in range(5):
        r = rng.uniform(0, 1.8)
        theta = rng.uniform(0, 2 * np.pi)
        random_vertices.append(r * np.exp(1j * theta))

    # Infinity is always the last vertex.
    return fixed + random_vertices + [np.inf]
def _points_to_json(points):
    """Serialize complex points as JSON-ready {"real", "imag"} dicts."""
    return [{"real": float(p.real), "imag": float(p.imag)} for p in points]


def create_llm_benchmark_data(optimal_config_file=None, threshold=8.0):
    """
    Create the LLM benchmark test data.

    Args:
        optimal_config_file: Path to JSON file with optimal 9-vertex config.
            If None, or if the file does not exist, a placeholder
            configuration is used and its volume is computed with
            get_volume.
        threshold: Volume threshold recorded in the benchmark and used for
            the optimal configuration's expected result. Defaults to 8.0.

    Returns:
        Dictionary with test data: "metadata", "test_instructions",
        "optimal_configuration", "random_configurations" and "summary".
    """
    # Fixed and free vertices are tracked as two separate lists (infinity
    # is implicit and never stored), so the exported "fixed"/"free" groups
    # do not depend on positional slicing of a combined list.
    if optimal_config_file and os.path.exists(optimal_config_file):
        with open(optimal_config_file, 'r', encoding='utf-8') as f:
            optimal_data = json.load(f)
        stored = optimal_data['optimal_configuration']
        optimal_fixed = [
            complex(v['real'], v['imag']) for v in stored['vertices']['fixed']
        ]
        optimal_free = [
            complex(v['real'], v['imag']) for v in stored['vertices']['free']
        ]
        optimal_volume = stored['volume']
    else:
        print("Warning: Optimal configuration file not found. Using placeholder.")
        # Placeholder - will be replaced once optimization completes
        optimal_fixed = [0+0j, 1+0j, 1j]
        optimal_free = [0.5+0.5j, -0.3+0.7j, 0.8-0.2j,
                        -0.5-0.3j, 0.2+0.9j]
        # Infinity goes last, matching every other get_volume call here.
        optimal_volume = get_volume(optimal_fixed + optimal_free + [np.inf])

    # Generate 9 random configurations
    random_configs = []
    for i in range(9):
        seed = 1000 + i
        vertices = generate_random_9vertex_config(seed=seed)
        random_configs.append({
            "id": i + 1,
            "seed": seed,
            "vertices": {
                "fixed": _points_to_json(vertices[:3]),
                # Indices 3..7 are the random points; index 8 is infinity.
                "free": _points_to_json(vertices[3:8]),
                "infinity": "implicit"
            },
            "volume": float(get_volume(vertices))
        })

    random_volumes = [cfg['volume'] for cfg in random_configs]

    # Compile test data
    test_data = {
        "metadata": {
            "description": "LLM Benchmark: Ideal Polyhedron Volume Computation",
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "test_type": "volume_threshold_classification",
            "threshold": threshold,
            "context": {
                "gpt4.5_claim": "Maximum 9-vertex volume is approximately 9.13",
                "gemini2.5_claim": "Can construct 9-vertex polyhedron with volume > 10",
                "reality": "Optimal 9-vertex volume found is approximately 8.16 (with standard constraints: 0, 1, i fixed)"
            }
        },
        "test_instructions": {
            "task": f"For each configuration, determine if the ideal polyhedron volume exceeds {threshold}",
            "expected_result": "Only the optimal configuration should return True",
            "format": f"Use volume_exceeds_threshold(vertices, {threshold}) from volume_threshold module"
        },
        "optimal_configuration": {
            "id": 0,
            "description": f"Optimal 9-vertex configuration (should return True for threshold {threshold})",
            "volume": float(optimal_volume),
            "vertices": {
                "fixed": _points_to_json(optimal_fixed),
                "free": _points_to_json(optimal_free),
                "infinity": "implicit"
            },
            # bool(): if get_volume returns a NumPy scalar, the comparison
            # yields a NumPy bool, which json.dump cannot serialize.
            "expected_result": bool(optimal_volume > threshold)
        },
        "random_configurations": random_configs,
        "summary": {
            "total_configs": 1 + len(random_configs),
            "optimal_count": 1,
            "random_count": len(random_configs),
            "threshold": threshold,
            "optimal_volume": float(optimal_volume),
            "random_volumes": {
                "min": float(min(random_volumes)),
                "max": float(max(random_volumes)),
                "mean": float(np.mean(random_volumes))
            }
        }
    }
    return test_data
def _vertices_from_json(vertex_spec):
    """Rebuild a vertex list (fixed + free + infinity) from its JSON form."""
    finite = [
        complex(v['real'], v['imag'])
        for group in ('fixed', 'free')
        for v in vertex_spec[group]
    ]
    # Infinity is stored as "implicit" in the JSON; re-append it last.
    return finite + [np.inf]


def _status_label(passed):
    """Return the printable PASS/FAIL marker for a single check."""
    return '✓ PASS' if passed else '✗ FAIL'


def run_benchmark_test(test_data):
    """
    Run the benchmark test on all configurations.

    The volume of every configuration is recomputed with get_volume and
    compared against the threshold stored in the metadata. The optimal
    configuration must match its recorded "expected_result"; every random
    configuration is expected NOT to exceed the threshold.

    Args:
        test_data: Dictionary from create_llm_benchmark_data()

    Returns:
        Dictionary with test results: "results" (one entry per
        configuration) and "summary" (total/passed/failed/success_rate).
    """
    threshold = test_data['metadata']['threshold']
    results = []

    print("=" * 70)
    print("Running LLM Benchmark Test")
    print("=" * 70)
    print(f"Threshold: {threshold}")
    print()

    # Test optimal configuration
    print("Testing optimal configuration...")
    opt_cfg = test_data['optimal_configuration']
    opt_volume = get_volume(_vertices_from_json(opt_cfg['vertices']))
    opt_exceeds = opt_volume > threshold
    opt_passed = opt_exceeds == opt_cfg['expected_result']
    print(f" Volume: {opt_volume:.6f}")
    print(f" Exceeds {threshold}? {opt_exceeds}")
    print(f" Expected: {opt_cfg['expected_result']}")
    print(f" Status: {_status_label(opt_passed)}")
    print()
    # bool()/float() keep the result JSON-serializable in case the
    # comparisons or volumes are NumPy scalars.
    results.append({
        "id": 0,
        "type": "optimal",
        "volume": float(opt_volume),
        "exceeds_threshold": bool(opt_exceeds),
        "expected": bool(opt_cfg['expected_result']),
        "passed": bool(opt_passed)
    })

    # Test random configurations
    print("Testing random configurations...")
    expected = False  # Random configs should not exceed threshold
    for cfg in test_data['random_configurations']:
        volume = get_volume(_vertices_from_json(cfg['vertices']))
        exceeds = volume > threshold
        passed_check = exceeds == expected
        print(f" Config {cfg['id']}: volume = {volume:.6f}, exceeds = {exceeds} {_status_label(passed_check)}")
        results.append({
            "id": cfg['id'],
            "type": "random",
            "seed": cfg['seed'],
            "volume": float(volume),
            "exceeds_threshold": bool(exceeds),
            "expected": bool(expected),
            "passed": bool(passed_check)
        })

    print()
    print("=" * 70)
    print("Test Summary")
    print("=" * 70)
    passed = sum(1 for r in results if r['passed'])
    # total is at least 1: the optimal configuration is always recorded.
    total = len(results)
    print(f"Passed: {passed}/{total}")
    print(f"Success rate: {100 * passed / total:.1f}%")
    print()

    return {
        "results": results,
        "summary": {
            "total_tests": total,
            "passed": passed,
            "failed": total - passed,
            "success_rate": float(passed / total)
        }
    }
def main():
    """
    Generate benchmark data and run test.

    Writes the benchmark data and the test results as JSON files under
    "tests/". All paths are relative to the current working directory.
    """
    print("\n" + "=" * 70)
    print("LLM BENCHMARK TEST GENERATOR")
    print("=" * 70)
    print()

    # Look for optimal configuration file
    optimal_config_file = "results/data/9vertex_optimal_for_llm_test.json"

    # Create benchmark data
    print("Generating benchmark data...")
    test_data = create_llm_benchmark_data(optimal_config_file)

    # Save benchmark data
    output_file = "tests/llm_benchmark_9vertex.json"
    # Derive the directory from the output path so the two cannot drift.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, indent=2)
    print(f"✓ Benchmark data saved to: {output_file}")
    print()

    random_volumes = test_data['summary']['random_volumes']
    print(f"Optimal volume: {test_data['optimal_configuration']['volume']:.6f}")
    print(f"Random volumes: {random_volumes['min']:.6f} - {random_volumes['max']:.6f}")
    print()

    # Run the test
    test_results = run_benchmark_test(test_data)

    # Save results
    results_file = "tests/llm_benchmark_9vertex_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(test_results, f, indent=2)
    print(f"✓ Test results saved to: {results_file}")
    print()
# Generate the benchmark and run it only when this file is executed as a
# script; importing the module has no such side effect.
if __name__ == "__main__":
    main()