Spaces:

igriv
/

idealpolyhedra

Running on CPU Upgrade

App Files Files Community

idealpolyhedra / examples /generate_llm_benchmark.py

igriv

Add HuggingFace Spaces support

e0ef700 about 1 month ago

raw

history blame contribute delete

9.73 kB

	#!/usr/bin/env python3
	"""
	Generate a benchmark for testing LLM geometric reasoning abilities.

	This creates a "torture test" where we present a set of triangulations
	(by default 1 + up to 5; the generator may find fewer non-realizable cases):
	- 1 is Delaunay realizable (has a valid point set)
	- the rest are NOT realizable (created via edge flips)

	The challenge: Given only the triangulation, can the LLM:
	1. Produce a valid point set with that combinatorial structure, OR
	2. Correctly identify that no such point set exists?

	We use pynauty for robust isomorphism checking to verify answers.
	"""

	import json
	import sys
	from collections import Counter
	from datetime import datetime
	from pathlib import Path

	import numpy as np
	from scipy.spatial import Delaunay

	sys.path.insert(0, str(Path(__file__).parent.parent))

	from ideal_poly_volume_toolkit.rivin_delaunay import (
	check_delaunay_realizability,
	random_edge_flips,
	)


	def triangulation_to_dict(triangles, label=""):
	    """Build a JSON-serializable record describing a triangulation.

	    Args:
	        triangles: Sized collection of triangles, each an iterable of
	            vertex indices (plain ints or numpy integers).
	        label: Identifier stored under the 'label' key.

	    Returns:
	        Dict with keys 'label', 'n_vertices' (number of distinct vertex
	        indices used), 'n_triangles' and 'triangles' (lists of Python ints).
	    """
	    distinct_vertices = set()
	    plain_triangles = []
	    for triangle in triangles:
	        distinct_vertices.update(triangle)
	        # numpy integer scalars are not JSON serializable; cast each to int.
	        plain_triangles.append([int(vertex) for vertex in triangle])

	    record = {'label': label}
	    record['n_vertices'] = int(len(distinct_vertices))
	    record['n_triangles'] = int(len(triangles))
	    record['triangles'] = plain_triangles
	    return record


	def _has_overused_edge(triangles):
	    """Return True if some undirected edge belongs to more than two triangles."""
	    edge_uses = Counter()
	    for v0, v1, v2 in triangles:
	        for a, b in ((v0, v1), (v1, v2), (v2, v0)):
	            # Normalize orientation so (a, b) and (b, a) count as one edge.
	            edge = (a, b) if a <= b else (b, a)
	            edge_uses[edge] += 1
	            if edge_uses[edge] > 2:
	                return True
	    return False


	def generate_benchmark(n_points=150, n_flips=40, n_non_realizable=5, seed=42,
	                       max_attempts=1000):
	    """
	    Generate LLM benchmark with 1 realizable + N non-realizable triangulations.

	    The realizable case is the Delaunay triangulation of random points in the
	    unit square; those points are stored as its certificate. Non-realizable
	    candidates come from random_edge_flips() applied to that triangulation and
	    are kept only if (a) no edge lies in more than two triangles and
	    (b) check_delaunay_realizability() reports them as not realizable.

	    Note: Finding valid non-realizable triangulations is challenging because most
	    random edge flips either (a) create invalid triangulations (edges in >2 triangles)
	    or (b) remain realizable. We aim for n_non_realizable but may get fewer.

	    Challenge labels ("challenge_0", "challenge_1", ...) are assigned AFTER the
	    shuffle, so neither a challenge's position nor its label reveals which one
	    is realizable.

	    Args:
	        n_points: Number of vertices
	        n_flips: Nominal number of edge flips per non-realizable triangulation.
	            Each attempt uses n_flips + (attempt % 40) - 20 flips, i.e. an
	            offset in [-20, +19], never fewer than 20 flips.
	        n_non_realizable: Target number of non-realizable cases (may get fewer)
	        seed: Random seed. Seeds numpy's global RNG; attempt k passes
	            seed + k to random_edge_flips(), and the shuffle uses seed + 999.
	        max_attempts: Upper bound on edge-flip attempts before giving up.

	    Returns:
	        Dict with keys 'metadata', 'challenges' and 'instructions', or None
	        if the base Delaunay triangulation fails the realizability check.
	    """
	    np.random.seed(seed)

	    print("="*70)
	    print("GENERATING LLM GEOMETRIC REASONING BENCHMARK")
	    print("="*70)
	    print()

	    # Generate random points and compute Delaunay triangulation
	    print(f"Step 1: Generate {n_points} random points")
	    points = np.random.rand(n_points, 2)
	    print(" ✓ Points generated")
	    print()

	    print("Step 2: Compute Delaunay triangulation")
	    delaunay = Delaunay(points)
	    realizable_triangulation = [tuple(simplex) for simplex in delaunay.simplices]
	    print(f" ✓ Triangulation: {len(realizable_triangulation)} triangles")
	    print()

	    # Verify it's realizable
	    print("Step 3: Verify realizability")
	    result = check_delaunay_realizability(realizable_triangulation, verbose=False)
	    if not result['realizable']:
	        print(" ✗ ERROR: Base triangulation not realizable (unexpected!)")
	        return None
	    print(f" ✓ Confirmed realizable (min angle: {np.degrees(result['min_angle_radians']):.2f}°)")
	    print()

	    # Generate non-realizable triangulations via edge flips
	    print(f"Step 4: Generate up to {n_non_realizable} non-realizable triangulations ({n_flips} flips each)")
	    print(" (Using check_delaunay_realizability() to verify each is non-realizable)")
	    print(" Note: Many edge flips create invalid triangulations or remain realizable")
	    print()
	    non_realizable_triangulations = []

	    attempts = 0
	    while len(non_realizable_triangulations) < n_non_realizable and attempts < max_attempts:
	        attempts += 1

	        # Vary the flip count between attempts (offset -20..+19, floor of 20)
	        # to get variety in the candidates.
	        flips_to_try = max(20, n_flips + (attempts % 40) - 20)

	        flipped = random_edge_flips(
	            realizable_triangulation,
	            n_flips=flips_to_try,
	            seed=seed + attempts,
	        )

	        # IMPORTANT: First check it's a VALID triangulation (no edge in >2 triangles)
	        if _has_overused_edge(flipped):
	            if attempts % 10 == 0:
	                print(f" (attempt {attempts}: invalid triangulation after flips, skipping...)")
	            continue

	        # IMPORTANT: Verify it's actually non-realizable using Rivin's LP test
	        result = check_delaunay_realizability(flipped, verbose=False)
	        if result['realizable']:
	            # Still realizable after flips - skip it
	            if attempts % 10 == 0:
	                print(f" (attempt {attempts}: still realizable after {flips_to_try} flips, continuing...)")
	            continue

	        # Confirmed non-realizable AND valid triangulation!
	        non_realizable_triangulations.append(flipped)
	        # 'realizable' is already known to be falsy here, so only 'success'
	        # decides between a clean infeasibility and a reported message.
	        lp_status = "infeasible" if result.get('success') else result.get('message', 'unknown')
	        print(f" ✓ Non-realizable #{len(non_realizable_triangulations)}: "
	              f"{len(flipped)} triangles, {flips_to_try} flips, LP: {lp_status}")

	    if len(non_realizable_triangulations) < n_non_realizable:
	        print()
	        print(f" ⚠ Warning: Only found {len(non_realizable_triangulations)}/{n_non_realizable} non-realizable triangulations")
	        print(" (This is expected - most edge flips create invalid or still-realizable triangulations)")
	    print()

	    # Package benchmark data
	    print("Step 5: Package benchmark")

	    # Pair each triangulation with its answer fields. Labels are deliberately
	    # NOT assigned yet: they are attached after shuffling (see below).
	    entries = [(
	        realizable_triangulation,
	        {
	            'is_realizable': True,
	            'solution_exists': True,
	            # IMPORTANT: the actual points that realize this triangulation,
	            # saved as a certificate/proof.
	            'certificate_points': points.tolist(),
	        },
	    )]
	    entries.extend(
	        (triangles, {'is_realizable': False, 'solution_exists': False})
	        for triangles in non_realizable_triangulations
	    )

	    # Shuffle challenges so the realizable one isn't always first
	    np.random.seed(seed + 999)
	    indices = np.arange(len(entries))
	    np.random.shuffle(indices)

	    # Label by final position so the label carries no information about
	    # which challenge is the realizable one.
	    challenges_shuffled = []
	    for position, source_index in enumerate(indices):
	        triangles, answer = entries[source_index]
	        challenges_shuffled.append({
	            **triangulation_to_dict(triangles, label=f"challenge_{position}"),
	            **answer,
	        })

	    benchmark = {
	        'metadata': {
	            'description': 'LLM Geometric Reasoning Benchmark',
	            'n_vertices': n_points,
	            'n_challenges': len(challenges_shuffled),
	            'n_realizable': 1,
	            'n_non_realizable': len(non_realizable_triangulations),
	            'target_non_realizable': n_non_realizable,
	            'note': 'Non-realizable count may be less than target due to edge flip constraints',
	            'generated': datetime.now().isoformat(),
	            'seed': seed,
	        },
	        'challenges': challenges_shuffled,
	        'instructions': (
	            "For each challenge, you are given a triangulation specified as a list of triangles. "
	            "Each triangle is a tuple of three vertex indices. "
	            "Your task: Either (1) produce a set of 2D points such that the Delaunay triangulation "
	            "of those points has the same combinatorial structure as the given triangulation, "
	            "OR (2) output 'None' if no such point set exists. "
	            "Note: Vertex labels may permute - we check graph isomorphism using canonical forms."
	        ),
	    }

	    print(f" ✓ Created {len(challenges_shuffled)} challenges")
	    print(" ✓ Challenges have been shuffled")
	    print()

	    return benchmark


	def main():
	    """Command-line entry point: parse options, build the benchmark, write JSON.

	    Returns:
	        Process exit status: 0 after the benchmark file has been written,
	        1 if generate_benchmark() returned None.
	    """
	    import argparse

	    parser = argparse.ArgumentParser(
	        description="Generate LLM geometric reasoning benchmark"
	    )

	    # All integer-valued options share one shape: (flag, default, help text).
	    integer_options = (
	        ("--points", 150,
	         "Number of vertices (default: 150)"),
	        ("--flips", 40,
	         "Number of edge flips for non-realizable triangulations (default: 40)"),
	        ("--non-realizable", 5,
	         "Target number of non-realizable cases (default: 5, may get fewer)"),
	        ("--seed", 42,
	         "Random seed (default: 42)"),
	    )
	    for flag, default_value, help_text in integer_options:
	        parser.add_argument(flag, type=int, default=default_value, help=help_text)
	    parser.add_argument(
	        "--output",
	        type=str,
	        default="llm_benchmark.json",
	        help="Output JSON file (default: llm_benchmark.json)",
	    )

	    args = parser.parse_args()

	    hash_rule = "#"*70
	    print()
	    print(hash_rule)
	    print("# LLM Geometric Reasoning Benchmark Generator")
	    print(hash_rule)
	    print()

	    benchmark = generate_benchmark(
	        n_points=args.points,
	        n_flips=args.flips,
	        n_non_realizable=args.non_realizable,
	        seed=args.seed,
	    )

	    # generate_benchmark() signals failure by returning None.
	    if benchmark is None:
	        print("✗ Benchmark generation failed")
	        return 1

	    # Save to JSON
	    output_path = Path(args.output)
	    with open(output_path, 'w') as handle:
	        json.dump(benchmark, handle, indent=2)

	    metadata = benchmark['metadata']
	    equals_rule = "="*70
	    print(equals_rule)
	    print("BENCHMARK GENERATED")
	    print(equals_rule)
	    print(f"Output file: {output_path}")
	    print(f"Total challenges: {len(benchmark['challenges'])}")
	    print(f"Realizable: {metadata['n_realizable']}")
	    print(f"Non-realizable: {metadata['n_non_realizable']}")
	    print(equals_rule)

	    return 0


	if __name__ == "__main__":
	    # Run the CLI and use main()'s return value as the process exit status.
	    raise SystemExit(main())