Spaces:

igriv
/

idealpolyhedra

Sleeping

File size: 10,006 Bytes

8750b11

#!/usr/bin/env python3
"""
LLM Benchmark Test for Ideal Polyhedron Volume Computation

This creates a test to expose LLM confusion about ideal polyhedron volumes:
- GPT-4.5: Claims maximum 9-vertex volume is ~9.13
- Gemini 2.5 Deep Think: Claims to construct volume > 10
- Reality: Optimal 9-vertex volume found is approximately 8.16
  (with the standard constraints: 0, 1, i fixed)

Test structure:
1. One optimal 9-vertex configuration (volume ≈ 8.16)
2. Nine random 9-vertex configurations (expected volume < 8.0)
3. Threshold test: volume > 8.0 should be True only for optimal

This test can be used to evaluate whether an LLM can correctly:
- Compute ideal polyhedron volumes
- Distinguish optimal from random configurations
- Understand the geometric constraints
"""

import numpy as np
import json
import os
import sys
from datetime import datetime

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from ideal_poly_volume_toolkit.volume_threshold import get_volume


def generate_random_9vertex_config(seed):
    """
    Generate a reproducible pseudo-random 9-vertex configuration.

    The configuration consists of the three fixed vertices 0, 1 and i,
    five pseudo-random points within distance 1.8 of the origin, and the
    point at infinity (``np.inf``) as the last entry.

    The radius of each random point is drawn uniformly from [0, 1.8) and
    its angle uniformly from [0, 2*pi), so points are denser near the
    origin than an area-uniform sample would be.

    Args:
        seed: Seed for the pseudo-random generator. The same seed always
              yields the same configuration.

    Returns:
        List of 9 vertices ``[0, 1, i, z1, ..., z5, np.inf]``: eight
        complex numbers followed by the float ``np.inf``.
    """
    # A private RandomState yields the same stream as the legacy
    # np.random.seed(seed) + np.random.uniform(...) combination, but does
    # not reseed the process-wide NumPy generator as a side effect.
    rng = np.random.RandomState(seed)

    # Fixed vertices
    fixed = [0+0j, 1+0j, 1j]

    # Five random vertices in polar form.
    random_vertices = []
    for _ in range(5):
        # Radius is drawn before the angle; the draw order determines
        # which values a given seed produces.
        r = rng.uniform(0, 1.8)
        theta = rng.uniform(0, 2 * np.pi)
        random_vertices.append(r * np.exp(1j * theta))

    # Infinity is stored explicitly as the final vertex.
    return fixed + random_vertices + [np.inf]


def create_llm_benchmark_data(optimal_config_file=None, threshold=8.0):
    """
    Create the LLM benchmark test data.

    The data set contains one "optimal" configuration (loaded from
    ``optimal_config_file`` when that file exists, otherwise a built-in
    placeholder whose volume is computed with ``get_volume``) and nine
    random configurations generated with seeds 1000..1008.

    Args:
        optimal_config_file: Path to JSON file with optimal 9-vertex config.
                            The file must provide
                            ``optimal_configuration.vertices.fixed``,
                            ``optimal_configuration.vertices.free`` (lists of
                            ``{"real", "imag"}`` objects) and
                            ``optimal_configuration.volume``.
                            If None or the file does not exist, a
                            placeholder configuration is used.
        threshold: Volume threshold used for the classification task.
                   Defaults to 8.0.

    Returns:
        JSON-serializable dictionary with the keys ``metadata``,
        ``test_instructions``, ``optimal_configuration``,
        ``random_configurations`` and ``summary``. The point at infinity
        is never stored; it is marked as ``"implicit"``.
    """
    # Normalize so the threshold renders as e.g. "8.0" in the text fields
    # even when an int is passed.
    threshold = float(threshold)

    def _as_json_points(points):
        # Convert complex vertices to plain {"real", "imag"} dictionaries.
        return [{"real": float(z.real), "imag": float(z.imag)} for z in points]

    # Load optimal configuration if available
    if optimal_config_file and os.path.exists(optimal_config_file):
        with open(optimal_config_file, 'r') as f:
            optimal_data = json.load(f)

        stored = optimal_data['optimal_configuration']
        optimal_fixed = [
            complex(v['real'], v['imag']) for v in stored['vertices']['fixed']
        ]
        optimal_free = [
            complex(v['real'], v['imag']) for v in stored['vertices']['free']
        ]
        optimal_volume = stored['volume']

    else:
        print("Warning: Optimal configuration file not found. Using placeholder.")
        # Placeholder - will be replaced once optimization completes
        optimal_fixed = [0+0j, 1+0j, 1j]
        optimal_free = [0.5+0.5j, -0.3+0.7j, 0.8-0.2j,
                        -0.5-0.3j, 0.2+0.9j]
        # Infinity goes last, matching the layout used for the random
        # configurations. Keeping fixed/free separate ensures infinity is
        # never exported as a "free" vertex and no finite point is dropped.
        optimal_volume = get_volume(optimal_fixed + optimal_free + [np.inf])

    # Generate 9 random configurations
    random_configs = []
    for i in range(9):
        seed = 1000 + i
        vertices = generate_random_9vertex_config(seed=seed)
        volume = get_volume(vertices)
        random_configs.append({
            "id": i + 1,
            "seed": seed,
            "vertices": {
                "fixed": _as_json_points(vertices[:3]),
                # Everything between the fixed vertices and the trailing ∞
                "free": _as_json_points(vertices[3:-1]),
                "infinity": "implicit"
            },
            "volume": float(volume)
        })

    random_volumes = [cfg['volume'] for cfg in random_configs]

    # Compile test data
    test_data = {
        "metadata": {
            "description": "LLM Benchmark: Ideal Polyhedron Volume Computation",
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "test_type": "volume_threshold_classification",
            "threshold": threshold,
            "context": {
                "gpt4.5_claim": "Maximum 9-vertex volume is approximately 9.13",
                "gemini2.5_claim": "Can construct 9-vertex polyhedron with volume > 10",
                "reality": "Optimal 9-vertex volume found is approximately 8.16 (with standard constraints: 0, 1, i fixed)"
            }
        },
        "test_instructions": {
            "task": f"For each configuration, determine if the ideal polyhedron volume exceeds {threshold}",
            "expected_result": "Only the optimal configuration should return True",
            "format": f"Use volume_exceeds_threshold(vertices, {threshold}) from volume_threshold module"
        },
        "optimal_configuration": {
            "id": 0,
            "description": f"Optimal 9-vertex configuration (should return True for threshold {threshold})",
            "volume": float(optimal_volume),
            "vertices": {
                "fixed": _as_json_points(optimal_fixed),
                "free": _as_json_points(optimal_free),
                "infinity": "implicit"
            },
            # bool() guards against a NumPy boolean, which json cannot
            # serialize, should get_volume return a NumPy scalar.
            "expected_result": bool(optimal_volume > threshold)
        },
        "random_configurations": random_configs,
        "summary": {
            "total_configs": 1 + len(random_configs),
            "optimal_count": 1,
            "random_count": len(random_configs),
            "threshold": threshold,
            "optimal_volume": float(optimal_volume),
            "random_volumes": {
                "min": float(min(random_volumes)),
                "max": float(max(random_volumes)),
                "mean": float(np.mean(random_volumes))
            }
        }
    }

    return test_data


def run_benchmark_test(test_data):
    """
    Re-compute every configuration's volume and compare it to the threshold.

    The optimal configuration is checked against its stored
    ``expected_result``; each random configuration is expected NOT to
    exceed the threshold. Progress and a summary are printed to stdout.

    Args:
        test_data: Dictionary from create_llm_benchmark_data()

    Returns:
        Dictionary with a ``results`` list (one entry per configuration)
        and a ``summary`` holding total, passed, failed and success rate.
    """
    def _to_vertex_list(vertex_spec):
        # Rebuild complex vertices from the stored fixed + free points and
        # append the implicit point at infinity.
        finite = [
            complex(point['real'], point['imag'])
            for group in ('fixed', 'free')
            for point in vertex_spec[group]
        ]
        return finite + [np.inf]

    threshold = test_data['metadata']['threshold']
    rule = "=" * 70
    results = []

    print(rule)
    print("Running LLM Benchmark Test")
    print(rule)
    print(f"Threshold: {threshold}")
    print()

    # --- optimal configuration ---
    print("Testing optimal configuration...")
    optimal = test_data['optimal_configuration']
    measured = get_volume(_to_vertex_list(optimal['vertices']))
    above = measured > threshold

    print(f"  Volume: {measured:.6f}")
    print(f"  Exceeds {threshold}? {above}")
    wanted = optimal['expected_result']
    print(f"  Expected: {wanted}")
    verdict = '✓ PASS' if above == wanted else '✗ FAIL'
    print(f"  Status: {verdict}")
    print()

    results.append({
        "id": 0,
        "type": "optimal",
        "volume": float(measured),
        "exceeds_threshold": bool(above),
        "expected": bool(wanted),
        "passed": bool(above == wanted)
    })

    # --- random configurations ---
    print("Testing random configurations...")
    for entry in test_data['random_configurations']:
        vol = get_volume(_to_vertex_list(entry['vertices']))
        over = vol > threshold
        # A random configuration is never expected to beat the threshold.
        should_exceed = False

        if over == should_exceed:
            status = "✓ PASS"
        else:
            status = "✗ FAIL"
        print(f"  Config {entry['id']}: volume = {vol:.6f}, exceeds = {over} {status}")

        results.append({
            "id": entry['id'],
            "type": "random",
            "seed": entry['seed'],
            "volume": float(vol),
            "exceeds_threshold": bool(over),
            "expected": bool(should_exceed),
            "passed": bool(over == should_exceed)
        })

    # --- summary ---
    total = len(results)
    passed = sum(outcome['passed'] for outcome in results)

    print()
    print(rule)
    print("Test Summary")
    print(rule)
    print(f"Passed: {passed}/{total}")
    print(f"Success rate: {100 * passed / total:.1f}%")
    print()

    summary = {
        "total_tests": total,
        "passed": passed,
        "failed": total - passed,
        "success_rate": float(passed / total)
    }
    return {"results": results, "summary": summary}


def main():
    """Build the benchmark data, write it to disk, run the test, save results.

    Reads the optimal configuration from
    ``results/data/9vertex_optimal_for_llm_test.json`` (relative to the
    current working directory) and writes two JSON files into ``tests/``.
    """
    def _dump_json(path, payload):
        # Write payload as indented JSON, overwriting any existing file.
        with open(path, 'w') as handle:
            json.dump(payload, handle, indent=2)

    divider = "=" * 70
    print("\n" + divider)
    print("LLM BENCHMARK TEST GENERATOR")
    print(divider)
    print()

    # Location of the optimal configuration produced elsewhere
    optimal_config_file = "results/data/9vertex_optimal_for_llm_test.json"

    print("Generating benchmark data...")
    benchmark = create_llm_benchmark_data(optimal_config_file)

    # Persist the benchmark definition
    data_path = "tests/llm_benchmark_9vertex.json"
    os.makedirs("tests", exist_ok=True)
    _dump_json(data_path, benchmark)

    print(f"✓ Benchmark data saved to: {data_path}")
    print()
    print(f"Optimal volume: {benchmark['optimal_configuration']['volume']:.6f}")
    spread = benchmark['summary']['random_volumes']
    print(f"Random volumes: {spread['min']:.6f} - {spread['max']:.6f}")
    print()

    # Evaluate every configuration against the threshold
    outcome = run_benchmark_test(benchmark)

    # Persist the evaluation results
    results_path = "tests/llm_benchmark_9vertex_results.json"
    _dump_json(results_path, outcome)

    print(f"✓ Test results saved to: {results_path}")
    print()


# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()