Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Demo: How to use the LLM Geometric Reasoning Benchmark. | |
| This script demonstrates: | |
| 1. Loading the benchmark | |
| 2. Extracting a challenge | |
| 3. Checking responses (both correct and incorrect examples) | |
| """ | |
| import numpy as np | |
| import json | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from check_llm_response import check_response | |
def _banner(title):
    """Print a section banner: a 70-char rule, the title, another rule, blank line."""
    print("=" * 70)
    print(title)
    print("=" * 70)
    print()


def _show_challenge(idx, challenge, show_realizable=False):
    """Print the label and size stats for one challenge dict.

    Args:
        idx: Index of the challenge within the benchmark's challenge list.
        challenge: Challenge dict with 'label', 'n_vertices', 'n_triangles'
            and 'solution_exists' keys.
        show_realizable: If True, also print the ground-truth realizability.
    """
    print(f"Challenge {idx} ({challenge['label']}):")
    print(f" Vertices: {challenge['n_vertices']}")
    print(f" Triangles: {challenge['n_triangles']}")
    if show_realizable:
        print(f" Actually realizable: {challenge['solution_exists']}")
    print()


def demo():
    """Run the interactive benchmark demo.

    Loads ``test_benchmark.json`` from this script's directory, then walks
    through three checker scenarios: a correct ``None`` answer for a
    non-realizable challenge, an incorrect random point set for the same
    challenge, and an incorrect ``None`` answer for a realizable one.

    Returns:
        0 on success, 1 if the benchmark file is missing.
    """
    print()
    _banner("LLM GEOMETRIC REASONING BENCHMARK - DEMO")

    # Load the benchmark produced by examples/generate_llm_benchmark.py.
    benchmark_path = Path(__file__).parent / "test_benchmark.json"
    if not benchmark_path.exists():
        print(f"Error: Benchmark file not found: {benchmark_path}")
        print("Run: python examples/generate_llm_benchmark.py first")
        return 1
    with open(benchmark_path, 'r') as f:
        benchmark = json.load(f)
    print(f"Loaded benchmark: {benchmark_path}")
    print(f" Challenges: {len(benchmark['challenges'])}")
    print(f" Vertices: {benchmark['metadata']['n_vertices']}")
    print()

    # Locate one non-realizable and one realizable challenge.
    # Scans the whole list, so the LAST index of each kind is kept
    # (matches the original behavior).
    non_realizable_idx = None
    realizable_idx = None
    for i, challenge in enumerate(benchmark['challenges']):
        if not challenge['solution_exists']:
            non_realizable_idx = i
        else:
            realizable_idx = i

    # Demo 1: the correct response for a non-realizable challenge is None.
    if non_realizable_idx is not None:
        _banner("DEMO 1: Correct answer for non-realizable challenge")
        challenge = benchmark['challenges'][non_realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        _show_challenge(non_realizable_idx, challenge)
        print("LLM Response: None")
        print()
        # Return value is intentionally discarded; verbose=True prints the verdict.
        check_response(triangulation, None, verbose=True)
        print()
        input("Press Enter to continue...")
        print()

    # Demo 2: a wrong answer — random points for a non-realizable challenge.
    if non_realizable_idx is not None:
        _banner("DEMO 2: Incorrect answer (random points for non-realizable)")
        challenge = benchmark['challenges'][non_realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        # Fixed seed so the demo output is reproducible across runs.
        np.random.seed(999)
        wrong_points = np.random.rand(challenge['n_vertices'], 2)
        _show_challenge(non_realizable_idx, challenge)
        print(f"LLM Response: Random point set (shape {wrong_points.shape})")
        print()
        check_response(triangulation, wrong_points, verbose=True)
        print()
        input("Press Enter to continue...")
        print()

    # Demo 3: a wrong answer — claiming None for a realizable challenge.
    if realizable_idx is not None:
        _banner("DEMO 3: Incorrect answer (None for realizable)")
        challenge = benchmark['challenges'][realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        _show_challenge(realizable_idx, challenge, show_realizable=True)
        print("LLM Response: None (wrong!)")
        print()
        check_response(triangulation, None, verbose=True)
        print()

    _banner("DEMO COMPLETE")
    print("Key takeaways:")
    print("1. Checker verifies realizability using Rivin's LP constraints")
    print("2. Checker uses pynauty for robust isomorphism checking")
    print("3. Vertex relabeling is handled automatically")
    print("4. Both false positives and false negatives are detected")
    print()
    print("Try it yourself:")
    print(" python examples/check_llm_response.py <benchmark.json> <challenge_idx> --points <None|file.npy>")
    print()
    return 0
# Script entry point: run the demo and use its 0/1 return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(demo())