idealpolyhedra / examples /demo_llm_benchmark.py
igriv's picture
Add HuggingFace Spaces support
e0ef700
#!/usr/bin/env python3
"""
Demo: How to use the LLM Geometric Reasoning Benchmark.
This script demonstrates:
1. Loading the benchmark
2. Extracting a challenge
3. Checking responses (both correct and incorrect examples)
"""
import numpy as np
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from check_llm_response import check_response
def _print_banner(title):
    """Print *title* framed above and below by a 70-character rule."""
    print("="*70)
    print(title)
    print("="*70)


def _print_challenge_summary(index, challenge):
    """Print the index, label, vertex count and triangle count of a challenge."""
    print(f"Challenge {index} ({challenge['label']}):")
    print(f" Vertices: {challenge['n_vertices']}")
    print(f" Triangles: {challenge['n_triangles']}")


def _pause():
    """Wait for Enter, but keep going when stdin cannot supply a line."""
    try:
        input("Press Enter to continue...")
    except EOFError:
        # stdin is closed or exhausted (piped / hosted / CI run): there is
        # nobody to press Enter, so finish the prompt line and carry on
        # instead of aborting the demo with a traceback.
        print()


def demo() -> int:
    """Walk through the LLM geometric reasoning benchmark checker.

    Loads ``test_benchmark.json`` from the directory containing this script,
    picks one non-realizable and one realizable challenge (the last of each
    kind in the file), and passes three example responses to
    ``check_response`` with ``verbose=True``:

    1. ``None`` for the non-realizable challenge (the correct answer),
    2. a seeded random point set for the non-realizable challenge,
    3. ``None`` for the realizable challenge.

    A demo is skipped when the benchmark holds no challenge of the kind it
    needs. The value returned by ``check_response`` is not used here.

    Returns:
        0 when the demo ran to completion, 1 when the benchmark file is
        missing.
    """
    print()
    _print_banner("LLM GEOMETRIC REASONING BENCHMARK - DEMO")
    print()

    # Load benchmark
    benchmark_path = Path(__file__).parent / "test_benchmark.json"
    if not benchmark_path.exists():
        print(f"Error: Benchmark file not found: {benchmark_path}")
        print("Run: python examples/generate_llm_benchmark.py first")
        return 1

    # Explicit encoding so parsing does not depend on the platform default.
    with open(benchmark_path, 'r', encoding='utf-8') as f:
        benchmark = json.load(f)
    challenges = benchmark['challenges']

    print(f"Loaded benchmark: {benchmark_path}")
    print(f" Challenges: {len(challenges)}")
    print(f" Vertices: {benchmark['metadata']['n_vertices']}")
    print()

    # Pick one challenge of each kind. The loop deliberately runs to the end,
    # so the LAST non-realizable and the LAST realizable challenge are used.
    non_realizable_idx = None
    realizable_idx = None
    for i, challenge in enumerate(challenges):
        if not challenge['solution_exists']:
            non_realizable_idx = i
        else:
            realizable_idx = i

    # Demo 1: Correct response for non-realizable
    if non_realizable_idx is not None:
        _print_banner("DEMO 1: Correct answer for non-realizable challenge")
        print()
        challenge = challenges[non_realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        _print_challenge_summary(non_realizable_idx, challenge)
        print()
        print("LLM Response: None")
        print()
        check_response(triangulation, None, verbose=True)
        print()
        _pause()
        print()

    # Demo 2: Incorrect response for non-realizable (giving random points)
    if non_realizable_idx is not None:
        _print_banner("DEMO 2: Incorrect answer (random points for non-realizable)")
        print()
        challenge = challenges[non_realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        # Generate random points (wrong answer!). A private RandomState
        # produces the same stream as np.random.seed(999) + np.random.rand
        # without reseeding the process-wide NumPy generator.
        rng = np.random.RandomState(999)
        wrong_points = rng.rand(challenge['n_vertices'], 2)
        _print_challenge_summary(non_realizable_idx, challenge)
        print()
        print(f"LLM Response: Random point set (shape {wrong_points.shape})")
        print()
        check_response(triangulation, wrong_points, verbose=True)
        print()
        _pause()
        print()

    # Demo 3: Wrong answer for realizable (saying None)
    if realizable_idx is not None:
        _print_banner("DEMO 3: Incorrect answer (None for realizable)")
        print()
        challenge = challenges[realizable_idx]
        triangulation = [tuple(tri) for tri in challenge['triangles']]
        _print_challenge_summary(realizable_idx, challenge)
        print(f" Actually realizable: {challenge['solution_exists']}")
        print()
        print("LLM Response: None (wrong!)")
        print()
        check_response(triangulation, None, verbose=True)
        print()

    _print_banner("DEMO COMPLETE")
    print()
    print("Key takeaways:")
    print("1. Checker verifies realizability using Rivin's LP constraints")
    print("2. Checker uses pynauty for robust isomorphism checking")
    print("3. Vertex relabeling is handled automatically")
    print("4. Both false positives and false negatives are detected")
    print()
    print("Try it yourself:")
    print(" python examples/check_llm_response.py <benchmark.json> <challenge_idx> --points <None|file.npy>")
    print()
    return 0
# Script entry point: run the demo and report its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = demo()
    sys.exit(exit_status)