Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| Generate a benchmark for testing LLM geometric reasoning abilities. | |
| This creates a "torture test" where we present 10 triangulations: | |
| - 1 is Delaunay realizable (has a valid point set) | |
| - 9 are NOT realizable (created via edge flips) | |
| The challenge: Given only the triangulation, can the LLM: | |
| 1. Produce a valid point set with that combinatorial structure, OR | |
| 2. Correctly identify that no such point set exists? | |
| We use pynauty for robust isomorphism checking to verify answers. | |
| """ | |
| import numpy as np | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from scipy.spatial import Delaunay | |
| from datetime import datetime | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from ideal_poly_volume_toolkit.rivin_delaunay import ( | |
| check_delaunay_realizability, | |
| random_edge_flips, | |
| ) | |
| def triangulation_to_dict(triangles, label=""): | |
| """Convert triangulation to serializable dict.""" | |
| return { | |
| 'label': label, | |
| 'n_vertices': int(len(set(v for tri in triangles for v in tri))), | |
| 'n_triangles': int(len(triangles)), | |
| 'triangles': [[int(v) for v in tri] for tri in triangles], # Convert numpy int32 to Python int | |
| } | |
| def generate_benchmark(n_points=150, n_flips=40, n_non_realizable=5, seed=42): | |
| """ | |
| Generate LLM benchmark with 1 realizable + N non-realizable triangulations. | |
| Note: Finding valid non-realizable triangulations is challenging because most | |
| random edge flips either (a) create invalid triangulations (edges in >2 triangles) | |
| or (b) remain realizable. We aim for n_non_realizable but may get fewer. | |
| Args: | |
| n_points: Number of vertices | |
| n_flips: Number of edge flips per non-realizable triangulation | |
| n_non_realizable: Target number of non-realizable cases (may get fewer) | |
| seed: Random seed | |
| Returns: | |
| Dict with benchmark data | |
| """ | |
| np.random.seed(seed) | |
| print("="*70) | |
| print("GENERATING LLM GEOMETRIC REASONING BENCHMARK") | |
| print("="*70) | |
| print() | |
| # Generate random points and compute Delaunay triangulation | |
| print(f"Step 1: Generate {n_points} random points") | |
| points = np.random.rand(n_points, 2) | |
| print(f" ✓ Points generated") | |
| print() | |
| print("Step 2: Compute Delaunay triangulation") | |
| tri = Delaunay(points) | |
| realizable_triangulation = [tuple(simplex) for simplex in tri.simplices] | |
| print(f" ✓ Triangulation: {len(realizable_triangulation)} triangles") | |
| print() | |
| # Verify it's realizable | |
| print("Step 3: Verify realizability") | |
| result = check_delaunay_realizability(realizable_triangulation, verbose=False) | |
| if not result['realizable']: | |
| print(" ✗ ERROR: Base triangulation not realizable (unexpected!)") | |
| return None | |
| print(f" ✓ Confirmed realizable (min angle: {np.degrees(result['min_angle_radians']):.2f}°)") | |
| print() | |
| # Generate non-realizable triangulations via edge flips | |
| print(f"Step 4: Generate up to {n_non_realizable} non-realizable triangulations ({n_flips} flips each)") | |
| print(f" (Using check_delaunay_realizability() to verify each is non-realizable)") | |
| print(f" Note: Many edge flips create invalid triangulations or remain realizable") | |
| print() | |
| non_realizable_triangulations = [] | |
| attempts = 0 | |
| max_attempts = 1000 # Many attempts needed due to filtering | |
| while len(non_realizable_triangulations) < n_non_realizable and attempts < max_attempts: | |
| attempts += 1 | |
| # Try different numbers of flips to get variety | |
| flips_to_try = n_flips + (attempts % 40) - 20 # Vary between n_flips-20 to n_flips+20 | |
| flips_to_try = max(20, flips_to_try) | |
| flipped = random_edge_flips( | |
| realizable_triangulation, | |
| n_flips=flips_to_try, | |
| seed=seed + attempts | |
| ) | |
| # IMPORTANT: First check it's a VALID triangulation (no edge in >2 triangles) | |
| from collections import Counter | |
| edge_count = Counter() | |
| for tri in flipped: | |
| v0, v1, v2 = tri | |
| for edge in [tuple(sorted([v0, v1])), tuple(sorted([v1, v2])), tuple(sorted([v2, v0]))]: | |
| edge_count[edge] += 1 | |
| if any(count > 2 for count in edge_count.values()): | |
| # Invalid triangulation - skip it | |
| if attempts % 10 == 0: | |
| print(f" (attempt {attempts}: invalid triangulation after flips, skipping...)") | |
| continue | |
| # IMPORTANT: Verify it's actually non-realizable using Rivin's LP test | |
| result = check_delaunay_realizability(flipped, verbose=False) | |
| if not result['realizable']: | |
| # Confirmed non-realizable AND valid triangulation! | |
| non_realizable_triangulations.append(flipped) | |
| lp_status = "infeasible" if result.get('success') and not result['realizable'] else result.get('message', 'unknown') | |
| print(f" ✓ Non-realizable #{len(non_realizable_triangulations)}: " | |
| f"{len(flipped)} triangles, {flips_to_try} flips, LP: {lp_status}") | |
| else: | |
| # Still realizable after flips - skip it | |
| if attempts % 10 == 0: | |
| print(f" (attempt {attempts}: still realizable after {flips_to_try} flips, continuing...)") | |
| if len(non_realizable_triangulations) < n_non_realizable: | |
| print() | |
| print(f" âš Warning: Only found {len(non_realizable_triangulations)}/{n_non_realizable} non-realizable triangulations") | |
| print(f" (This is expected - most edge flips create invalid or still-realizable triangulations)") | |
| print() | |
| # Package benchmark data | |
| print("Step 5: Package benchmark") | |
| challenges = [] | |
| # Add the realizable triangulation | |
| # IMPORTANT: Save the points as a certificate/proof | |
| challenges.append({ | |
| **triangulation_to_dict(realizable_triangulation, label="challenge_0"), | |
| 'is_realizable': True, | |
| 'solution_exists': True, | |
| 'certificate_points': points.tolist(), # The actual points that realize this triangulation | |
| }) | |
| # Add non-realizable triangulations | |
| for i, tri in enumerate(non_realizable_triangulations): | |
| challenges.append({ | |
| **triangulation_to_dict(tri, label=f"challenge_{i+1}"), | |
| 'is_realizable': False, | |
| 'solution_exists': False, | |
| }) | |
| # Shuffle challenges so the realizable one isn't always first | |
| np.random.seed(seed + 999) | |
| indices = np.arange(len(challenges)) | |
| np.random.shuffle(indices) | |
| challenges_shuffled = [challenges[i] for i in indices] | |
| benchmark = { | |
| 'metadata': { | |
| 'description': 'LLM Geometric Reasoning Benchmark', | |
| 'n_vertices': n_points, | |
| 'n_challenges': len(challenges_shuffled), | |
| 'n_realizable': 1, | |
| 'n_non_realizable': len(non_realizable_triangulations), | |
| 'target_non_realizable': n_non_realizable, | |
| 'note': 'Non-realizable count may be less than target due to edge flip constraints', | |
| 'generated': datetime.now().isoformat(), | |
| 'seed': seed, | |
| }, | |
| 'challenges': challenges_shuffled, | |
| 'instructions': ( | |
| "For each challenge, you are given a triangulation specified as a list of triangles. " | |
| "Each triangle is a tuple of three vertex indices. " | |
| "Your task: Either (1) produce a set of 2D points such that the Delaunay triangulation " | |
| "of those points has the same combinatorial structure as the given triangulation, " | |
| "OR (2) output 'None' if no such point set exists. " | |
| "Note: Vertex labels may permute - we check graph isomorphism using canonical forms." | |
| ), | |
| } | |
| print(f" ✓ Created {len(challenges_shuffled)} challenges") | |
| print(f" ✓ Challenges have been shuffled") | |
| print() | |
| return benchmark | |
| def main(): | |
| import argparse | |
| parser = argparse.ArgumentParser( | |
| description="Generate LLM geometric reasoning benchmark" | |
| ) | |
| parser.add_argument( | |
| "--points", | |
| type=int, | |
| default=150, | |
| help="Number of vertices (default: 150)", | |
| ) | |
| parser.add_argument( | |
| "--flips", | |
| type=int, | |
| default=40, | |
| help="Number of edge flips for non-realizable triangulations (default: 40)", | |
| ) | |
| parser.add_argument( | |
| "--non-realizable", | |
| type=int, | |
| default=5, | |
| help="Target number of non-realizable cases (default: 5, may get fewer)", | |
| ) | |
| parser.add_argument( | |
| "--seed", | |
| type=int, | |
| default=42, | |
| help="Random seed (default: 42)", | |
| ) | |
| parser.add_argument( | |
| "--output", | |
| type=str, | |
| default="llm_benchmark.json", | |
| help="Output JSON file (default: llm_benchmark.json)", | |
| ) | |
| args = parser.parse_args() | |
| print() | |
| print("#"*70) | |
| print("# LLM Geometric Reasoning Benchmark Generator") | |
| print("#"*70) | |
| print() | |
| benchmark = generate_benchmark( | |
| n_points=args.points, | |
| n_flips=args.flips, | |
| n_non_realizable=args.non_realizable, | |
| seed=args.seed | |
| ) | |
| if benchmark is None: | |
| print("✗ Benchmark generation failed") | |
| return 1 | |
| # Save to JSON | |
| output_path = Path(args.output) | |
| with open(output_path, 'w') as f: | |
| json.dump(benchmark, f, indent=2) | |
| print("="*70) | |
| print("BENCHMARK GENERATED") | |
| print("="*70) | |
| print(f"Output file: {output_path}") | |
| print(f"Total challenges: {len(benchmark['challenges'])}") | |
| print(f"Realizable: {benchmark['metadata']['n_realizable']}") | |
| print(f"Non-realizable: {benchmark['metadata']['n_non_realizable']}") | |
| print("="*70) | |
| return 0 | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |