Spaces:
Sleeping
Sleeping
File size: 10,006 Bytes
8750b11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | #!/usr/bin/env python3
"""
LLM Benchmark Test for Ideal Polyhedron Volume Computation
This creates a test to expose LLM confusion about ideal polyhedron volumes:
- GPT-4.5: Claims maximum 9-vertex volume is ~9.13
- Gemini 2.5 Deep Think: Claims to construct volume > 10
- Reality: Optimal 9-vertex volume is approximately 9.8
Test structure:
1. One optimal 9-vertex configuration (volume ≈ 9.8)
2. Nine random 9-vertex configurations (volume < 9.8)
3. Threshold test: volume > 9.8 should be True only for optimal
This test can be used to evaluate whether an LLM can correctly:
- Compute ideal polyhedron volumes
- Distinguish optimal from random configurations
- Understand the geometric constraints
"""
import numpy as np
import json
import os
import sys
from datetime import datetime
# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ideal_poly_volume_toolkit.volume_threshold import get_volume
def generate_random_9vertex_config(seed):
    """
    Generate a reproducible random 9-vertex configuration.

    The configuration consists of the three fixed vertices 0, 1 and i,
    five pseudo-random finite vertices, and the point at infinity
    (``np.inf``) as the last entry.

    Each random vertex is ``r * exp(i * theta)`` with ``r`` drawn uniformly
    from [0, 1.8) and ``theta`` drawn uniformly from [0, 2*pi). Because the
    radius (not the area) is sampled uniformly, points are denser near the
    origin.

    Args:
        seed: Seed for the pseudo-random generator.

    Returns:
        List of 9 vertices: 3 fixed complex values, 5 random complex
        values, then ``np.inf``.
    """
    # A private legacy RandomState produces exactly the same stream as
    # np.random.seed(seed) followed by np.random.uniform(...), but leaves
    # NumPy's global random state untouched for the caller.
    rng = np.random.RandomState(seed)

    # Fixed vertices 0, 1, i
    fixed = [0+0j, 1+0j, 1j]

    # Five random vertices inside the disk of radius 1.8. The draw order
    # (radius first, then angle) is kept so that a given seed maps to the
    # same configuration as before.
    random_vertices = []
    for _ in range(5):
        r = rng.uniform(0, 1.8)
        theta = rng.uniform(0, 2 * np.pi)
        random_vertices.append(r * np.exp(1j * theta))

    # Infinity is stored explicitly as the final vertex
    return fixed + random_vertices + [np.inf]
def _vertices_to_records(vertices):
    """Convert finite complex vertices to JSON-ready ``{"real", "imag"}`` dicts."""
    return [{"real": float(v.real), "imag": float(v.imag)} for v in vertices]


def create_llm_benchmark_data(optimal_config_file=None, threshold=8.0):
    """
    Create the LLM benchmark test data.

    The data set contains one "optimal" 9-vertex configuration and nine
    random configurations (seeds 1000-1008), each stored as fixed vertices,
    free vertices and an implicit vertex at infinity, together with its
    volume as returned by ``get_volume``.

    Args:
        optimal_config_file: Path to JSON file with optimal 9-vertex config.
            If None or the file does not exist, a placeholder configuration
            is used and a warning is printed.
        threshold: Volume threshold recorded in the metadata and used to
            compute the optimal configuration's expected result.

    Returns:
        Dictionary with test data (JSON-serializable).
    """
    # Load optimal configuration if available
    if optimal_config_file and os.path.exists(optimal_config_file):
        with open(optimal_config_file, 'r', encoding='utf-8') as f:
            optimal_data = json.load(f)

        # Keep fixed and free vertices as separate lists instead of slicing
        # a combined list by hard-coded positions.
        stored = optimal_data['optimal_configuration']['vertices']
        optimal_fixed = [complex(v['real'], v['imag']) for v in stored['fixed']]
        optimal_free = [complex(v['real'], v['imag']) for v in stored['free']]
        optimal_volume = optimal_data['optimal_configuration']['volume']
    else:
        print("Warning: Optimal configuration file not found. Using placeholder.")
        # Placeholder - will be replaced once optimization completes.
        # Infinity is appended last, like everywhere else in this file; with
        # it in the middle of the list the "free" slice used to contain
        # infinity and silently dropped the last finite vertex.
        # NOTE(review): assumes get_volume does not depend on vertex order.
        optimal_fixed = [0+0j, 1+0j, 1j]
        optimal_free = [0.5+0.5j, -0.3+0.7j, 0.8-0.2j,
                        -0.5-0.3j, 0.2+0.9j]
        optimal_volume = get_volume(optimal_fixed + optimal_free + [np.inf])

    # Generate 9 random configurations
    random_configs = []
    for i in range(9):
        seed = 1000 + i
        vertices = generate_random_9vertex_config(seed=seed)
        volume = get_volume(vertices)
        random_configs.append({
            "id": i + 1,
            "seed": seed,
            "vertices": {
                "fixed": _vertices_to_records(vertices[:3]),
                # Skip 0, 1, i at the front and infinity at the end
                "free": _vertices_to_records(vertices[3:8]),
                "infinity": "implicit"
            },
            "volume": float(volume)
        })

    random_volumes = [cfg['volume'] for cfg in random_configs]

    # Compile test data
    test_data = {
        "metadata": {
            "description": "LLM Benchmark: Ideal Polyhedron Volume Computation",
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "test_type": "volume_threshold_classification",
            "threshold": threshold,
            "context": {
                "gpt4.5_claim": "Maximum 9-vertex volume is approximately 9.13",
                "gemini2.5_claim": "Can construct 9-vertex polyhedron with volume > 10",
                "reality": "Optimal 9-vertex volume found is approximately 8.16 (with standard constraints: 0, 1, i fixed)"
            }
        },
        "test_instructions": {
            "task": f"For each configuration, determine if the ideal polyhedron volume exceeds {threshold}",
            "expected_result": "Only the optimal configuration should return True",
            "format": f"Use volume_exceeds_threshold(vertices, {threshold}) from volume_threshold module"
        },
        "optimal_configuration": {
            "id": 0,
            "description": f"Optimal 9-vertex configuration (should return True for threshold {threshold})",
            "volume": float(optimal_volume),
            "vertices": {
                "fixed": _vertices_to_records(optimal_fixed),
                "free": _vertices_to_records(optimal_free),
                "infinity": "implicit"
            },
            # bool() because a NumPy comparison result is not JSON-serializable
            "expected_result": bool(optimal_volume > threshold)
        },
        "random_configurations": random_configs,
        "summary": {
            "total_configs": 1 + len(random_configs),
            "optimal_count": 1,
            "random_count": len(random_configs),
            "threshold": threshold,
            "optimal_volume": float(optimal_volume),
            "random_volumes": {
                "min": float(min(random_volumes)),
                "max": float(max(random_volumes)),
                "mean": float(np.mean(random_volumes))
            }
        }
    }
    return test_data
def _record_to_vertices(vertex_record):
    """Rebuild a vertex list (fixed, free, then infinity) from a stored record."""
    fixed = [complex(v['real'], v['imag']) for v in vertex_record['fixed']]
    free = [complex(v['real'], v['imag']) for v in vertex_record['free']]
    return fixed + free + [np.inf]


def _status_label(passed):
    """Return the printable PASS/FAIL marker for a single check."""
    return '✓ PASS' if passed else '✗ FAIL'


def run_benchmark_test(test_data):
    """
    Run the benchmark test on all configurations.

    Each configuration's volume is recomputed with ``get_volume`` from the
    stored vertices and compared against the threshold in the metadata.
    The optimal configuration is checked against its stored
    ``expected_result``; every random configuration is expected NOT to
    exceed the threshold. Progress and a summary are printed to stdout.

    Args:
        test_data: Dictionary from create_llm_benchmark_data()

    Returns:
        Dictionary with a per-configuration ``results`` list and a
        ``summary`` (total, passed, failed, success rate).
    """
    threshold = test_data['metadata']['threshold']
    results = []

    print("=" * 70)
    print("Running LLM Benchmark Test")
    print("=" * 70)
    print(f"Threshold: {threshold}")
    print()

    # Test optimal configuration
    print("Testing optimal configuration...")
    opt_cfg = test_data['optimal_configuration']
    opt_volume = get_volume(_record_to_vertices(opt_cfg['vertices']))
    opt_exceeds = opt_volume > threshold
    opt_passed = opt_exceeds == opt_cfg['expected_result']
    print(f" Volume: {opt_volume:.6f}")
    print(f" Exceeds {threshold}? {opt_exceeds}")
    print(f" Expected: {opt_cfg['expected_result']}")
    print(f" Status: {_status_label(opt_passed)}")
    print()
    results.append({
        "id": 0,
        "type": "optimal",
        "volume": float(opt_volume),
        "exceeds_threshold": bool(opt_exceeds),
        "expected": bool(opt_cfg['expected_result']),
        "passed": bool(opt_passed)
    })

    # Test random configurations
    print("Testing random configurations...")
    for cfg in test_data['random_configurations']:
        volume = get_volume(_record_to_vertices(cfg['vertices']))
        exceeds = volume > threshold
        expected = False  # Random configs should not exceed threshold
        passed_check = exceeds == expected
        print(f" Config {cfg['id']}: volume = {volume:.6f}, exceeds = {exceeds} {_status_label(passed_check)}")
        results.append({
            "id": cfg['id'],
            "type": "random",
            "seed": cfg['seed'],
            "volume": float(volume),
            "exceeds_threshold": bool(exceeds),
            "expected": bool(expected),
            "passed": bool(passed_check)
        })

    print()
    print("=" * 70)
    print("Test Summary")
    print("=" * 70)
    passed = sum(1 for r in results if r['passed'])
    total = len(results)  # always >= 1: the optimal configuration is included
    print(f"Passed: {passed}/{total}")
    print(f"Success rate: {100 * passed / total:.1f}%")
    print()

    return {
        "results": results,
        "summary": {
            "total_tests": total,
            "passed": passed,
            "failed": total - passed,
            "success_rate": float(passed / total)
        }
    }
def main():
    """Generate benchmark data, run the test, and save both as JSON files.

    Writes ``tests/llm_benchmark_9vertex.json`` (benchmark data) and
    ``tests/llm_benchmark_9vertex_results.json`` (test results), both
    relative to the current working directory.
    """
    print("\n" + "=" * 70)
    print("LLM BENCHMARK TEST GENERATOR")
    print("=" * 70)
    print()

    # Look for optimal configuration file
    optimal_config_file = "results/data/9vertex_optimal_for_llm_test.json"

    # Create benchmark data
    print("Generating benchmark data...")
    test_data = create_llm_benchmark_data(optimal_config_file)

    # Save benchmark data; the directory is derived from the output path so
    # the two cannot drift apart.
    output_file = "tests/llm_benchmark_9vertex.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, indent=2)
    print(f"✓ Benchmark data saved to: {output_file}")
    print()

    random_volumes = test_data['summary']['random_volumes']
    print(f"Optimal volume: {test_data['optimal_configuration']['volume']:.6f}")
    print(f"Random volumes: {random_volumes['min']:.6f} - {random_volumes['max']:.6f}")
    print()

    # Run the test
    test_results = run_benchmark_test(test_data)

    # Save results
    results_file = "tests/llm_benchmark_9vertex_results.json"
    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(test_results, f, indent=2)
    print(f"✓ Test results saved to: {results_file}")
    print()


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|