Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| LLM Benchmark Test for Ideal Polyhedron Volume Computation | |
| This creates a test to expose LLM confusion about ideal polyhedron volumes: | |
| - GPT-4.5: Claims maximum 9-vertex volume is ~9.13 | |
| - Gemini 2.5 Deep Think: Claims to construct volume > 10 | |
- Reality: Optimal 9-vertex volume found is approximately 8.16 (with 0, 1, i fixed)
Test structure:
1. One optimal 9-vertex configuration (volume ≈ 8.16)
2. Nine random 9-vertex configurations (volume < 8.0)
3. Threshold test: volume > 8.0 should be True only for optimal
| This test can be used to evaluate whether an LLM can correctly: | |
| - Compute ideal polyhedron volumes | |
| - Distinguish optimal from random configurations | |
| - Understand the geometric constraints | |
| """ | |
| import numpy as np | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
| # Add parent directory to path | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| from ideal_poly_volume_toolkit.volume_threshold import get_volume | |
def generate_random_9vertex_config(seed):
    """
    Generate a random 9-vertex configuration.

    Three vertices are pinned at 0, 1 and i, five are drawn uniformly at
    random inside a disk of radius 1.8 centered at the origin, and the
    ninth vertex is the point at infinity.

    Args:
        seed: Integer seed for numpy's global RNG, making each
              configuration reproducible.

    Returns:
        List of 9 vertices: [0, 1, i] + 5 random complex points + [np.inf].
    """
    np.random.seed(seed)
    # Fixed (normalized) vertices: 0, 1, i
    fixed = [0+0j, 1+0j, 1j]
    # Generate 5 random vertices in a reasonable region
    random_vertices = []
    for _ in range(5):
        # Random point in a disk of radius 1.8
        # (previous comment said radius 2, but the draw is bounded by 1.8)
        r = np.random.uniform(0, 1.8)
        theta = np.random.uniform(0, 2 * np.pi)
        z = r * np.exp(1j * theta)
        random_vertices.append(z)
    # All vertices; infinity is appended explicitly here even though the
    # downstream JSON representation treats it as implicit.
    all_vertices = fixed + random_vertices + [np.inf]
    return all_vertices
def create_llm_benchmark_data(optimal_config_file=None):
    """
    Create the LLM benchmark test data.

    Args:
        optimal_config_file: Path to JSON file with optimal 9-vertex config.
                             If None (or the file does not exist), a
                             hard-coded placeholder configuration is used.

    Returns:
        Dictionary with test data: metadata, test instructions, the optimal
        configuration, nine random configurations, and a summary.
    """
    # Load optimal configuration if available
    if optimal_config_file and os.path.exists(optimal_config_file):
        with open(optimal_config_file, 'r') as f:
            optimal_data = json.load(f)
        # Extract vertices: fixed (0, 1, i) + free points + explicit infinity
        fixed = optimal_data['optimal_configuration']['vertices']['fixed']
        free = optimal_data['optimal_configuration']['vertices']['free']
        optimal_vertices = (
            [complex(v['real'], v['imag']) for v in fixed] +
            [complex(v['real'], v['imag']) for v in free] +
            [np.inf]
        )
        optimal_volume = optimal_data['optimal_configuration']['volume']
    else:
        print("Warning: Optimal configuration file not found. Using placeholder.")
        # Placeholder - will be replaced once optimization completes.
        # BUG FIX: np.inf must come LAST so the [:3] / [3:8] slices below
        # select the fixed and free vertices correctly. Previously np.inf
        # sat at index 3, which put infinity into the "free" list (breaking
        # strict JSON serialization) and silently dropped the last finite
        # vertex 0.2+0.9j.
        optimal_vertices = [0+0j, 1+0j, 1j,
                            0.5+0.5j, -0.3+0.7j, 0.8-0.2j,
                            -0.5-0.3j, 0.2+0.9j, np.inf]
        optimal_volume = get_volume(optimal_vertices)

    # Generate 9 random configurations (seeds 1000..1008 for reproducibility)
    random_configs = []
    for i in range(9):
        vertices = generate_random_9vertex_config(seed=1000 + i)
        volume = get_volume(vertices)
        random_configs.append({
            "id": i + 1,
            "seed": 1000 + i,
            "vertices": {
                "fixed": [
                    {"real": 0.0, "imag": 0.0},
                    {"real": 1.0, "imag": 0.0},
                    {"real": 0.0, "imag": 1.0}
                ],
                "free": [
                    {"real": float(v.real), "imag": float(v.imag)}
                    for v in vertices[3:8]  # Skip 0, 1, i and infinity
                ],
                "infinity": "implicit"
            },
            "volume": float(volume)
        })

    # Compile test data
    test_data = {
        "metadata": {
            "description": "LLM Benchmark: Ideal Polyhedron Volume Computation",
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "test_type": "volume_threshold_classification",
            "threshold": 8.0,
            "context": {
                "gpt4.5_claim": "Maximum 9-vertex volume is approximately 9.13",
                "gemini2.5_claim": "Can construct 9-vertex polyhedron with volume > 10",
                "reality": "Optimal 9-vertex volume found is approximately 8.16 (with standard constraints: 0, 1, i fixed)"
            }
        },
        "test_instructions": {
            "task": "For each configuration, determine if the ideal polyhedron volume exceeds 8.0",
            "expected_result": "Only the optimal configuration should return True",
            "format": "Use volume_exceeds_threshold(vertices, 8.0) from volume_threshold module"
        },
        "optimal_configuration": {
            "id": 0,
            "description": "Optimal 9-vertex configuration (should return True for threshold 8.0)",
            "volume": float(optimal_volume),
            "vertices": {
                "fixed": [
                    {"real": float(v.real), "imag": float(v.imag)}
                    for v in optimal_vertices[:3]
                ],
                "free": [
                    {"real": float(v.real), "imag": float(v.imag)}
                    for v in optimal_vertices[3:8]
                ],
                "infinity": "implicit"
            },
            "expected_result": optimal_volume > 8.0
        },
        "random_configurations": random_configs,
        "summary": {
            "total_configs": 10,
            "optimal_count": 1,
            "random_count": 9,
            "threshold": 8.0,
            "optimal_volume": float(optimal_volume),
            "random_volumes": {
                "min": float(min(cfg['volume'] for cfg in random_configs)),
                "max": float(max(cfg['volume'] for cfg in random_configs)),
                "mean": float(np.mean([cfg['volume'] for cfg in random_configs]))
            }
        }
    }
    return test_data
def run_benchmark_test(test_data):
    """
    Run the benchmark test on all configurations.

    Args:
        test_data: Dictionary from create_llm_benchmark_data()

    Returns:
        Dictionary with test results
    """
    threshold = test_data['metadata']['threshold']

    def _rebuild(vertex_spec):
        # Reconstruct complex vertices from their JSON form; infinity is
        # implicit in the JSON and appended explicitly here.
        finite = vertex_spec['fixed'] + vertex_spec['free']
        return [complex(p['real'], p['imag']) for p in finite] + [np.inf]

    results = []
    print("=" * 70)
    print("Running LLM Benchmark Test")
    print("=" * 70)
    print(f"Threshold: {threshold}")
    print()

    # --- optimal configuration ---
    print("Testing optimal configuration...")
    opt_cfg = test_data['optimal_configuration']
    opt_volume = get_volume(_rebuild(opt_cfg['vertices']))
    opt_exceeds = opt_volume > threshold
    opt_ok = opt_exceeds == opt_cfg['expected_result']
    print(f" Volume: {opt_volume:.6f}")
    print(f" Exceeds {threshold}? {opt_exceeds}")
    print(f" Expected: {opt_cfg['expected_result']}")
    print(f" Status: {'β PASS' if opt_ok else 'β FAIL'}")
    print()
    results.append({
        "id": 0,
        "type": "optimal",
        "volume": float(opt_volume),
        "exceeds_threshold": bool(opt_exceeds),
        "expected": bool(opt_cfg['expected_result']),
        "passed": bool(opt_ok)
    })

    # --- random configurations (expected NOT to exceed the threshold) ---
    print("Testing random configurations...")
    for cfg in test_data['random_configurations']:
        volume = get_volume(_rebuild(cfg['vertices']))
        exceeds = volume > threshold
        ok = not exceeds
        status = "β PASS" if ok else "β FAIL"
        print(f" Config {cfg['id']}: volume = {volume:.6f}, exceeds = {exceeds} {status}")
        results.append({
            "id": cfg['id'],
            "type": "random",
            "seed": cfg['seed'],
            "volume": float(volume),
            "exceeds_threshold": bool(exceeds),
            "expected": False,
            "passed": bool(ok)
        })

    print()
    print("=" * 70)
    print("Test Summary")
    print("=" * 70)
    n_passed = sum(1 for r in results if r['passed'])
    n_total = len(results)
    print(f"Passed: {n_passed}/{n_total}")
    print(f"Success rate: {100 * n_passed / n_total:.1f}%")
    print()

    return {
        "results": results,
        "summary": {
            "total_tests": n_total,
            "passed": n_passed,
            "failed": n_total - n_passed,
            "success_rate": float(n_passed / n_total)
        }
    }
def main():
    """Generate benchmark data and run test."""
    print("\n" + "=" * 70)
    print("LLM BENCHMARK TEST GENERATOR")
    print("=" * 70)
    print()

    # Location where the optimizer drops its best configuration, if it ran.
    optimal_config_file = "results/data/9vertex_optimal_for_llm_test.json"

    # Build the benchmark payload and persist it for external consumers.
    print("Generating benchmark data...")
    test_data = create_llm_benchmark_data(optimal_config_file)

    output_file = "tests/llm_benchmark_9vertex.json"
    os.makedirs("tests", exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(test_data, f, indent=2)
    print(f"β Benchmark data saved to: {output_file}")
    print()

    opt_vol = test_data['optimal_configuration']['volume']
    rand_vols = test_data['summary']['random_volumes']
    print(f"Optimal volume: {opt_vol:.6f}")
    print(f"Random volumes: {rand_vols['min']:.6f} - {rand_vols['max']:.6f}")
    print()

    # Execute the classification test and persist its results alongside.
    test_results = run_benchmark_test(test_data)

    results_file = "tests/llm_benchmark_9vertex_results.json"
    with open(results_file, 'w') as f:
        json.dump(test_results, f, indent=2)
    print(f"β Test results saved to: {results_file}")
    print()
# Script entry point: build the benchmark data, run it, and save results.
if __name__ == "__main__":
    main()