#!/usr/bin/env python3 """ Generate a benchmark for testing LLM geometric reasoning abilities. This creates a "torture test" where we present 10 triangulations: - 1 is Delaunay realizable (has a valid point set) - 9 are NOT realizable (created via edge flips) The challenge: Given only the triangulation, can the LLM: 1. Produce a valid point set with that combinatorial structure, OR 2. Correctly identify that no such point set exists? We use pynauty for robust isomorphism checking to verify answers. """ import numpy as np import json import sys from pathlib import Path from scipy.spatial import Delaunay from datetime import datetime sys.path.insert(0, str(Path(__file__).parent.parent)) from ideal_poly_volume_toolkit.rivin_delaunay import ( check_delaunay_realizability, random_edge_flips, ) def triangulation_to_dict(triangles, label=""): """Convert triangulation to serializable dict.""" return { 'label': label, 'n_vertices': int(len(set(v for tri in triangles for v in tri))), 'n_triangles': int(len(triangles)), 'triangles': [[int(v) for v in tri] for tri in triangles], # Convert numpy int32 to Python int } def generate_benchmark(n_points=150, n_flips=40, n_non_realizable=5, seed=42): """ Generate LLM benchmark with 1 realizable + N non-realizable triangulations. Note: Finding valid non-realizable triangulations is challenging because most random edge flips either (a) create invalid triangulations (edges in >2 triangles) or (b) remain realizable. We aim for n_non_realizable but may get fewer. Args: n_points: Number of vertices n_flips: Number of edge flips per non-realizable triangulation n_non_realizable: Target number of non-realizable cases (may get fewer) seed: Random seed Returns: Dict with benchmark data """ np.random.seed(seed) print("="*70) print("GENERATING LLM GEOMETRIC REASONING BENCHMARK") print("="*70) print() # Generate random points and compute Delaunay triangulation print(f"Step 1: Generate {n_points} random points") points = np.random.rand(n_points, 2) print(f" ✓ Points generated") print() print("Step 2: Compute Delaunay triangulation") tri = Delaunay(points) realizable_triangulation = [tuple(simplex) for simplex in tri.simplices] print(f" ✓ Triangulation: {len(realizable_triangulation)} triangles") print() # Verify it's realizable print("Step 3: Verify realizability") result = check_delaunay_realizability(realizable_triangulation, verbose=False) if not result['realizable']: print(" ✗ ERROR: Base triangulation not realizable (unexpected!)") return None print(f" ✓ Confirmed realizable (min angle: {np.degrees(result['min_angle_radians']):.2f}°)") print() # Generate non-realizable triangulations via edge flips print(f"Step 4: Generate up to {n_non_realizable} non-realizable triangulations ({n_flips} flips each)") print(f" (Using check_delaunay_realizability() to verify each is non-realizable)") print(f" Note: Many edge flips create invalid triangulations or remain realizable") print() non_realizable_triangulations = [] attempts = 0 max_attempts = 1000 # Many attempts needed due to filtering while len(non_realizable_triangulations) < n_non_realizable and attempts < max_attempts: attempts += 1 # Try different numbers of flips to get variety flips_to_try = n_flips + (attempts % 40) - 20 # Vary between n_flips-20 to n_flips+20 flips_to_try = max(20, flips_to_try) flipped = random_edge_flips( realizable_triangulation, n_flips=flips_to_try, seed=seed + attempts ) # IMPORTANT: First check it's a VALID triangulation (no edge in >2 triangles) from collections import Counter edge_count = Counter() for tri in flipped: v0, v1, v2 = tri for edge in [tuple(sorted([v0, v1])), tuple(sorted([v1, v2])), tuple(sorted([v2, v0]))]: edge_count[edge] += 1 if any(count > 2 for count in edge_count.values()): # Invalid triangulation - skip it if attempts % 10 == 0: print(f" (attempt {attempts}: invalid triangulation after flips, skipping...)") continue # IMPORTANT: Verify it's actually non-realizable using Rivin's LP test result = check_delaunay_realizability(flipped, verbose=False) if not result['realizable']: # Confirmed non-realizable AND valid triangulation! non_realizable_triangulations.append(flipped) lp_status = "infeasible" if result.get('success') and not result['realizable'] else result.get('message', 'unknown') print(f" ✓ Non-realizable #{len(non_realizable_triangulations)}: " f"{len(flipped)} triangles, {flips_to_try} flips, LP: {lp_status}") else: # Still realizable after flips - skip it if attempts % 10 == 0: print(f" (attempt {attempts}: still realizable after {flips_to_try} flips, continuing...)") if len(non_realizable_triangulations) < n_non_realizable: print() print(f" ⚠ Warning: Only found {len(non_realizable_triangulations)}/{n_non_realizable} non-realizable triangulations") print(f" (This is expected - most edge flips create invalid or still-realizable triangulations)") print() # Package benchmark data print("Step 5: Package benchmark") challenges = [] # Add the realizable triangulation # IMPORTANT: Save the points as a certificate/proof challenges.append({ **triangulation_to_dict(realizable_triangulation, label="challenge_0"), 'is_realizable': True, 'solution_exists': True, 'certificate_points': points.tolist(), # The actual points that realize this triangulation }) # Add non-realizable triangulations for i, tri in enumerate(non_realizable_triangulations): challenges.append({ **triangulation_to_dict(tri, label=f"challenge_{i+1}"), 'is_realizable': False, 'solution_exists': False, }) # Shuffle challenges so the realizable one isn't always first np.random.seed(seed + 999) indices = np.arange(len(challenges)) np.random.shuffle(indices) challenges_shuffled = [challenges[i] for i in indices] benchmark = { 'metadata': { 'description': 'LLM Geometric Reasoning Benchmark', 'n_vertices': n_points, 'n_challenges': len(challenges_shuffled), 'n_realizable': 1, 'n_non_realizable': len(non_realizable_triangulations), 'target_non_realizable': n_non_realizable, 'note': 'Non-realizable count may be less than target due to edge flip constraints', 'generated': datetime.now().isoformat(), 'seed': seed, }, 'challenges': challenges_shuffled, 'instructions': ( "For each challenge, you are given a triangulation specified as a list of triangles. " "Each triangle is a tuple of three vertex indices. " "Your task: Either (1) produce a set of 2D points such that the Delaunay triangulation " "of those points has the same combinatorial structure as the given triangulation, " "OR (2) output 'None' if no such point set exists. " "Note: Vertex labels may permute - we check graph isomorphism using canonical forms." ), } print(f" ✓ Created {len(challenges_shuffled)} challenges") print(f" ✓ Challenges have been shuffled") print() return benchmark def main(): import argparse parser = argparse.ArgumentParser( description="Generate LLM geometric reasoning benchmark" ) parser.add_argument( "--points", type=int, default=150, help="Number of vertices (default: 150)", ) parser.add_argument( "--flips", type=int, default=40, help="Number of edge flips for non-realizable triangulations (default: 40)", ) parser.add_argument( "--non-realizable", type=int, default=5, help="Target number of non-realizable cases (default: 5, may get fewer)", ) parser.add_argument( "--seed", type=int, default=42, help="Random seed (default: 42)", ) parser.add_argument( "--output", type=str, default="llm_benchmark.json", help="Output JSON file (default: llm_benchmark.json)", ) args = parser.parse_args() print() print("#"*70) print("# LLM Geometric Reasoning Benchmark Generator") print("#"*70) print() benchmark = generate_benchmark( n_points=args.points, n_flips=args.flips, n_non_realizable=args.non_realizable, seed=args.seed ) if benchmark is None: print("✗ Benchmark generation failed") return 1 # Save to JSON output_path = Path(args.output) with open(output_path, 'w') as f: json.dump(benchmark, f, indent=2) print("="*70) print("BENCHMARK GENERATED") print("="*70) print(f"Output file: {output_path}") print(f"Total challenges: {len(benchmark['challenges'])}") print(f"Realizable: {benchmark['metadata']['n_realizable']}") print(f"Non-realizable: {benchmark['metadata']['n_non_realizable']}") print("="*70) return 0 if __name__ == "__main__": sys.exit(main())