#!/usr/bin/env python3
"""
LLM Benchmark Driver: Test volume threshold predictions.

This script loads test configurations and checks which ones exceed
the volume threshold. Designed to expose LLM confusion about volumes.

Usage:
    python run_benchmark.py
"""
import json
import os
import sys

import numpy as np

# Add the parent directory to the module search path so the toolkit imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from ideal_poly_volume_toolkit.geometry import ideal_poly_volume_via_delaunay


def load_test_suite(filepath="test_data/llm_benchmark_test_suite.json"):
    """Load the test suite from a JSON file."""
    with open(filepath, 'r') as f:
        return json.load(f)
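
# The loader assumes a suite layout along these lines: a sketch inferred
# from how the fields are consumed below, not a documented schema.
#
#   {
#     "metadata": {"threshold": <float>, "n_configurations": <int>},
#     "configurations": [
#       {
#         "id": "<string>",
#         "description": "<string>",
#         "expected_volume": <float, optional>,
#         "should_exceed_threshold": <bool>,
#         "vertices": [{"real": <float>, "imag": <float>}, ...]
#       },
#       ...
#     ]
#   }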


def run_test(config, threshold):
    """
    Run a single test case.

    Args:
        config: Configuration dict with vertices and expected results
        threshold: Volume threshold to test against

    Returns:
        dict: Test results
    """
    # Extract vertices and convert to a complex numpy array
    vertices = np.array([complex(v['real'], v['imag']) for v in config['vertices']])

    # Compute the volume using Bloch-Wigner (most accurate)
    actual_volume = ideal_poly_volume_via_delaunay(vertices, use_bloch_wigner=True)

    # Check whether the volume exceeds the threshold
    exceeds = actual_volume > threshold

    # Expected result from the test suite
    expected = config['should_exceed_threshold']

    return {
        "id": config['id'],
        "description": config['description'],
        "actual_volume": actual_volume,
        "expected_volume": config.get('expected_volume'),
        "threshold": threshold,
        "exceeds_threshold": exceeds,
        "expected_to_exceed": expected,
        "passed": exceeds == expected,
    }
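
# A standalone call might look like this (hypothetical config, for
# illustration only; real cases come from the JSON suite):
#
#   config = {
#       "id": "example",
#       "description": "four ideal vertices on the unit circle",
#       "should_exceed_threshold": False,
#       "vertices": [{"real": 1.0, "imag": 0.0}, {"real": 0.0, "imag": 1.0},
#                    {"real": -1.0, "imag": 0.0}, {"real": 0.0, "imag": -1.0}],
#   }
#   result = run_test(config, threshold=2.0)
#   # result["passed"] is True when exceeds_threshold matches
#   # should_exceed_threshold.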


def main():
    print("=" * 70)
    print("LLM BENCHMARK: Ideal Polyhedron Volume Test")
    print("=" * 70)

    # Load the test suite
    print("\nLoading test suite...")
    test_suite = load_test_suite()
    threshold = test_suite['metadata']['threshold']
    n_configs = test_suite['metadata']['n_configurations']
    print(f"  Threshold: {threshold}")
    print(f"  Configurations: {n_configs}")

    # Run the tests
    print(f"\n{'=' * 70}")
    print("Running tests...")
    print(f"{'=' * 70}\n")

    results = []
    for config in test_suite['configurations']:
        result = run_test(config, threshold)
        results.append(result)

        # Print the per-test result
        status = "✓ PASS" if result['passed'] else "✗ FAIL"
        exceeds_str = "EXCEEDS" if result['exceeds_threshold'] else "below"
        print(f"{status} | {result['id']:15s} | Volume: {result['actual_volume']:.6f} | {exceeds_str} threshold")

    # Summary
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    n_passed = sum(1 for r in results if r['passed'])
    n_failed = len(results) - n_passed
    print(f"Total tests: {len(results)}")
    print(f"Passed: {n_passed}")
    print(f"Failed: {n_failed}")

    if n_failed == 0:
        print("\n🎉 All tests passed!")
    else:
        print(f"\n⚠️  {n_failed} test(s) failed!")

    # Detailed results: the suite is assumed to list the optimal
    # configuration first, followed by the random ones
    optimal_result = results[0]
    random_results = results[1:]

    print("\nOptimal configuration:")
    print(f"  Volume: {optimal_result['actual_volume']:.8f}")
    print(f"  Exceeds {threshold}: {optimal_result['exceeds_threshold']}")

    if random_results:  # guard: max()/min() raise on an empty sequence
        print("\nRandom configurations:")
        print(f"  Max volume: {max(r['actual_volume'] for r in random_results):.8f}")
        print(f"  Min volume: {min(r['actual_volume'] for r in random_results):.8f}")
        print(f"  Any exceed {threshold}: {any(r['exceeds_threshold'] for r in random_results)}")

    return 0 if n_failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())