JustinTX's picture
Add files using upload-large-folder tool
14c9c2b verified
"""
Auxiliary Evaluation System for Circle Packing
===============================================
This module provides ADDITIONAL metrics and analysis beyond the ground truth evaluation.
It is designed to be extensible and allow for future LLM-generated evaluation code.
IMPORTANT:
- Does NOT replace or modify the ground truth evaluation (evaluate.py)
- Provides supplementary metrics that help guide evolution
- Designed for easy extension with new metrics (manual or LLM-generated)
Architecture:
1. MetricRegistry: Plugin system for registering metrics
2. AuxiliaryEvaluator: Orchestrates metric computation
3. Individual metric functions: Compute specific aspects
4. Feedback generation (AuxiliaryEvaluator._generate_feedback): Generate actionable text feedback
"""
import numpy as np
from typing import Dict, Any, List, Tuple, Callable, Optional
from dataclasses import dataclass, field
from pathlib import Path
import json
@dataclass
class MetricResult:
    """Result of a single auxiliary metric computation.

    Carries the metric's identifier and scalar value together with a hint on
    how to read that value, a short description, and optional extra data.
    """
    # Identifier of the metric that produced this result.
    name: str
    # Scalar value of the metric.
    value: float
    interpretation: str  # "higher_better" or "lower_better" or "neutral"
    # Human-readable summary of what the metric measures.
    description: str
    # Free-form supplementary data; every instance gets its own dict.
    details: Dict[str, Any] = field(default_factory=dict)
def _default_enabled_metrics() -> List[str]:
    """Return a fresh list naming every metric that is enabled by default."""
    return [
        "spatial_uniformity",
        "edge_utilization",
        "density_variance",
        "packing_efficiency",
        "radius_distribution",
        "gap_analysis",
        "geometric_quality",
    ]


@dataclass
class AuxiliaryEvalConfig:
    """Settings that control an auxiliary evaluation run."""

    # Master switch: AuxiliaryEvaluator.evaluate returns {} when this is False.
    enabled: bool = True
    # Registry names of the metrics to compute, in evaluation order.
    enabled_metrics: List[str] = field(default_factory=_default_enabled_metrics)
    # When True, the evaluation result also carries a text feedback report.
    generate_text_feedback: bool = True
    # NOTE(review): not read by any code visible in this file - confirm usage.
    save_detailed_analysis: bool = True
class MetricRegistry:
    """
    Name-keyed store of auxiliary metric functions and their metadata.

    Metrics are added through ``register`` in plugin fashion, so new ones
    (hand-written or LLM-generated) can be attached without touching the
    evaluator.
    """

    def __init__(self):
        # Metric name -> callable computing the metric.
        self._metrics: Dict[str, Callable] = {}
        # Metric name -> {"description": ..., "interpretation": ...}.
        self._metric_metadata: Dict[str, Dict[str, str]] = {}

    def register(
        self,
        name: str,
        func: Callable,
        description: str,
        interpretation: str = "higher_better"
    ):
        """
        Add a metric function under ``name``, replacing any previous entry.

        Args:
            name: Unique metric identifier
            func: Function with signature (centers, radii) -> MetricResult
            description: Human-readable description
            interpretation: "higher_better", "lower_better", or "neutral"
        """
        metadata = dict(description=description, interpretation=interpretation)
        self._metrics[name] = func
        self._metric_metadata[name] = metadata
        print(f"[MetricRegistry] Registered metric: {name}")

    def get(self, name: str) -> Optional[Callable]:
        """Return the metric function stored under ``name``, or None."""
        try:
            return self._metrics[name]
        except KeyError:
            return None

    def list_metrics(self) -> List[str]:
        """Return the registered metric names in registration order."""
        return [*self._metrics]

    def get_metadata(self, name: str) -> Dict[str, str]:
        """Return the metadata dict for ``name``; empty dict if unregistered."""
        if name in self._metric_metadata:
            return self._metric_metadata[name]
        return {}
# Global registry instance. The built-in metrics in this module are registered
# on it, AuxiliaryEvaluator resolves metric names through it, and
# LLMGeneratedMetric.register_to_global adds dynamically created metrics to it.
METRIC_REGISTRY = MetricRegistry()
# ============================================================================
# MANUALLY DESIGNED AUXILIARY METRICS
# ============================================================================
# These are hand-crafted metrics that provide useful signals for evolution.
# Future versions will allow LLM to generate additional metrics.
# ============================================================================
def compute_spatial_uniformity(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Measure how evenly the circle centers are spread out.

    For each center, the mean distance to its (up to) three nearest other
    centers serves as a proxy for the size of its cell. The score is
    ``1 / (1 + CV)`` where CV is the coefficient of variation of those
    proxies, so perfectly even spacing scores 1.0 and clustering lowers it.

    No Voronoi diagram is constructed. An earlier version built one without
    ever using it, which made the metric fail (score 0.0) on inputs that the
    neighbour-distance computation handles fine, such as fewer than four
    centers or collinear centers.

    Args:
        centers: Circle centers, array-like of shape (n, 2).
        radii: Circle radii. Not used by this metric; accepted so that all
            metrics share the ``(centers, radii)`` signature.

    Returns:
        MetricResult named "spatial_uniformity". ``value`` is 0.0 when there
        are fewer than two centers or when the computation fails; in the
        failure case ``details`` contains the error message.
    """
    try:
        points = np.asarray(centers, dtype=float)
        if points.size and points.ndim != 2:
            raise ValueError(
                f"centers must be a 2-D array of points, got shape {points.shape}"
            )
        num_points = points.shape[0] if points.ndim == 2 else 0

        if num_points >= 2:
            offsets = points[:, np.newaxis, :] - points[np.newaxis, :, :]
            pairwise = np.linalg.norm(offsets, axis=2)
            # Exclude each point from its own neighbour list by index rather
            # than by "distance > 0": coincident centers are real neighbours
            # at distance 0 and must still count as clustering.
            np.fill_diagonal(pairwise, np.inf)
            k_nearest = min(3, num_points - 1)
            cell_sizes = np.sort(pairwise, axis=1)[:, :k_nearest].mean(axis=1)
        else:
            # A lone point (or no point) has no neighbours to measure against.
            cell_sizes = np.empty(0)

        if cell_sizes.size == 0:
            size_mean = size_std = cv = 0.0
            uniformity = 0.0
        else:
            size_mean = float(np.mean(cell_sizes))
            size_std = float(np.std(cell_sizes))
            # Lower coefficient of variation = more uniform; the epsilon
            # guards against a zero mean when all centers coincide.
            cv = size_std / (size_mean + 1e-9)
            uniformity = 1.0 / (1.0 + cv)  # Maps CV in [0, inf) to (0, 1]

        return MetricResult(
            name="spatial_uniformity",
            value=float(uniformity),
            interpretation="higher_better",
            # Label kept unchanged so existing reports/logs stay comparable.
            description="Spatial distribution uniformity (Voronoi analysis)",
            details={
                "cell_size_mean": size_mean,
                "cell_size_std": size_std,
                "coefficient_of_variation": float(cv)
            }
        )
    except Exception as e:
        # Deliberate best-effort: auxiliary metrics must never abort a run.
        return MetricResult(
            name="spatial_uniformity",
            value=0.0,
            interpretation="higher_better",
            description="Spatial distribution uniformity (failed to compute)",
            details={"error": str(e)}
        )
def compute_edge_utilization(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Measure how well the packing utilizes the boundary of the unit square.

    A circle "touches" an edge when its rim lies within 0.02 of that edge (or
    beyond it). Circles touching exactly two edges are counted as corner
    circles, circles touching exactly one edge as edge circles. The score is
    ``0.6 * corner_score + 0.4 * edge_score``.

    Args:
        centers: Circle centers, iterable of (x, y) pairs.
        radii: Circle radii, paired with ``centers`` by position.

    Returns:
        MetricResult named "edge_utilization". ``details`` reports the raw
        corner/edge counts and both partial scores. An empty packing scores
        0.0 instead of raising ZeroDivisionError.
    """
    boundary_threshold = 0.02  # Distance to be considered "touching"
    max_corners = 4  # A square has four corners
    touching_edges = 0
    touching_corners = 0

    for (x, y), radius in zip(centers, radii):
        edge_count = sum((
            (x - radius) < boundary_threshold,          # left
            (x + radius) > (1.0 - boundary_threshold),  # right
            (y - radius) < boundary_threshold,          # bottom
            (y + radius) > (1.0 - boundary_threshold),  # top
        ))
        if edge_count == 2:
            touching_corners += 1
        elif edge_count == 1:
            touching_edges += 1

    num_circles = len(centers)
    # Several tiny circles can crowd a single corner, so the raw count may
    # exceed four; cap it so the partial score stays within [0, 1].
    corner_score = min(touching_corners, max_corners) / float(max_corners)
    # Fraction of circles on edges; defined as 0 for an empty packing.
    edge_score = touching_edges / float(num_circles) if num_circles else 0.0

    # Combined score (corners are more valuable)
    utilization = 0.6 * corner_score + 0.4 * edge_score

    return MetricResult(
        name="edge_utilization",
        value=float(utilization),
        interpretation="higher_better",
        description="Boundary and corner utilization",
        details={
            "corners_touched": touching_corners,
            "edges_touched": touching_edges,
            "corner_score": float(corner_score),
            "edge_score": float(edge_score)
        }
    )
def compute_density_variance(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Score how evenly circle area is distributed over a 10x10 grid.

    Every grid cell whose midpoint lies strictly inside a circle receives that
    circle's full area (a simplified overlap test). The coefficient of
    variation of the resulting grid is mapped to ``1 / (1 + CV)``, so a more
    uniform density gives a higher score.
    """
    grid_size = 10
    # Midpoint coordinate of each grid row/column along one axis.
    midpoints = [(idx + 0.5) / grid_size for idx in np.arange(grid_size)]
    grid = np.zeros((grid_size, grid_size))

    for (cx, cy), rad in zip(centers, radii):
        for row, cell_x in enumerate(midpoints):
            for col, cell_y in enumerate(midpoints):
                # Distance from the circle center to the cell midpoint.
                separation = np.sqrt((cell_x - cx)**2 + (cell_y - cy)**2)
                if separation < rad:
                    grid[row, col] += np.pi * rad**2  # Area contribution

    mean_density = float(np.mean(grid))
    variance = float(np.var(grid))

    # Inverse-CV transform: the epsilon keeps an all-zero grid well defined.
    cv = np.sqrt(variance) / (mean_density + 1e-9)
    uniformity_score = 1.0 / (1.0 + cv)

    details = {
        "grid_size": grid_size,
        "variance": variance,
        "mean_density": mean_density,
        "cv": float(cv)
    }
    return MetricResult(
        name="density_variance",
        value=float(uniformity_score),
        interpretation="higher_better",
        description="Spatial density uniformity across grid",
        details=details
    )
def compute_packing_efficiency(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Report the summed circle area as a fraction of the unit square's area.

    The true optimum for 26 circles is not known, so ``details`` additionally
    relates the result to a rough reference efficiency of 0.84.
    """
    square_area = 1.0
    circle_areas = np.pi * radii**2
    total_area = float(np.sum(circle_areas))
    efficiency = total_area / square_area

    # Reference point: the best known sum of radii is ~2.635. Assuming equal
    # radii, that corresponds to pi * (2.635/26)^2 * 26 ≈ 0.839 of the square.
    estimated_best_efficiency = 0.84  # Rough estimate
    relative_efficiency = efficiency / estimated_best_efficiency

    details = {
        "total_area": total_area,
        "square_area": square_area,
        "efficiency": efficiency,
        "relative_to_estimated_best": float(relative_efficiency)
    }
    return MetricResult(
        name="packing_efficiency",
        value=float(efficiency),
        interpretation="higher_better",
        description="Area utilization efficiency",
        details=details
    )
def compute_radius_distribution(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Summarise the spread of circle sizes.

    Radii are bucketed as small / medium / large relative to the mean, using
    half a standard deviation as the band width. The reported value is the
    entropy of the bucket proportions normalised by log(3).
    """
    radius_mean = float(np.mean(radii))
    radius_std = float(np.std(radii))
    radius_min = float(np.min(radii))
    radius_max = float(np.max(radii))

    # Bucket the radii into three size categories around the mean.
    small_radii = np.sum(radii < radius_mean - 0.5 * radius_std)
    medium_radii = np.sum(np.abs(radii - radius_mean) <= 0.5 * radius_std)
    large_radii = np.sum(radii > radius_mean + 0.5 * radius_std)

    # Entropy-like diversity over the non-empty buckets.
    bucket_counts = np.array([small_radii, medium_radii, large_radii])
    shares = bucket_counts / len(radii)
    shares = shares[shares > 0]  # Empty buckets contribute nothing
    entropy = -np.sum(shares * np.log(shares + 1e-9))
    diversity = entropy / np.log(3)  # Three buckets -> maximum entropy log(3)

    details = {
        "mean": radius_mean,
        "std": radius_std,
        "min": radius_min,
        "max": radius_max,
        "range": radius_max - radius_min,
        "small_count": int(small_radii),
        "medium_count": int(medium_radii),
        "large_count": int(large_radii),
        "diversity_score": float(diversity)
    }
    return MetricResult(
        name="radius_distribution",
        value=float(diversity),
        interpretation="neutral",
        description="Radius size diversity",
        details=details
    )
def compute_gap_analysis(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Estimate the covered fraction of the unit square by point sampling.

    A 50x50 lattice of cell midpoints is tested; a sample counts as covered
    when it lies strictly inside at least one circle. The value is the
    coverage, and ``details`` also reports the complementary gap ratio.
    """
    sample_size = 50  # Grid resolution
    total_samples = sample_size * sample_size
    # Midpoint coordinates of the sampling lattice along one axis.
    axis = [(k + 0.5) / sample_size for k in range(sample_size)]

    def _inside_any_circle(px, py):
        # any() stops at the first circle containing the sample point.
        return any(
            np.sqrt((px - center[0])**2 + (py - center[1])**2) < radius
            for center, radius in zip(centers, radii)
        )

    covered_samples = sum(
        1 for px in axis for py in axis if _inside_any_circle(px, py)
    )

    coverage = covered_samples / total_samples
    gap_ratio = 1.0 - coverage

    details = {
        "covered_samples": covered_samples,
        "total_samples": total_samples,
        "coverage": float(coverage),
        "gap_ratio": float(gap_ratio)
    }
    return MetricResult(
        name="gap_analysis",
        value=float(coverage),
        interpretation="higher_better",
        description="Area coverage (1 - gap ratio)",
        details=details
    )
def compute_geometric_quality(
    centers: np.ndarray,
    radii: np.ndarray
) -> MetricResult:
    """
    Rate the packing by the shape of its Delaunay triangles.

    Each triangle of the Delaunay triangulation of the centers is scored as
    shortest edge / longest edge (1.0 for an equilateral triangle); the metric
    value is the mean over all triangles. Any failure (for example scipy
    rejecting the input) yields a 0.0 result carrying the error text.
    """
    try:
        from scipy.spatial import Delaunay

        triangulation = Delaunay(centers)
        # Vertex index pairs describing the three edges of a triangle.
        edge_pairs = ((0, 1), (1, 2), (2, 0))

        triangle_qualities = []
        for simplex in triangulation.simplices:
            corners = centers[simplex]
            lengths = [
                np.linalg.norm(corners[b] - corners[a]) for a, b in edge_pairs
            ]
            longest = max(lengths)
            # Skip degenerate triangles whose vertices all coincide.
            if longest > 0:
                triangle_qualities.append(min(lengths) / longest)

        avg_quality = float(np.mean(triangle_qualities)) if triangle_qualities else 0.0
        worst = float(np.min(triangle_qualities)) if triangle_qualities else 0.0
        best = float(np.max(triangle_qualities)) if triangle_qualities else 0.0

        return MetricResult(
            name="geometric_quality",
            value=avg_quality,
            interpretation="higher_better",
            description="Delaunay triangulation quality",
            details={
                "num_triangles": len(triangulation.simplices),
                "avg_triangle_quality": avg_quality,
                "min_quality": worst,
                "max_quality": best
            }
        )
    except Exception as e:
        return MetricResult(
            name="geometric_quality",
            value=0.0,
            interpretation="higher_better",
            description="Delaunay triangulation quality (failed)",
            details={"error": str(e)}
        )
# ============================================================================
# REGISTER ALL METRICS
# ============================================================================
# Attach every hand-written metric to the global registry. The names used here
# are the keys that AuxiliaryEvalConfig.enabled_metrics refers to.
METRIC_REGISTRY.register(
    name="spatial_uniformity",
    func=compute_spatial_uniformity,
    description="Spatial distribution uniformity using Voronoi analysis",
    interpretation="higher_better",
)
METRIC_REGISTRY.register(
    name="edge_utilization",
    func=compute_edge_utilization,
    description="Boundary and corner utilization",
    interpretation="higher_better",
)
METRIC_REGISTRY.register(
    name="density_variance",
    func=compute_density_variance,
    description="Spatial density uniformity across grid",
    interpretation="higher_better",
)
METRIC_REGISTRY.register(
    name="packing_efficiency",
    func=compute_packing_efficiency,
    description="Area utilization efficiency",
    interpretation="higher_better",
)
METRIC_REGISTRY.register(
    name="radius_distribution",
    func=compute_radius_distribution,
    description="Radius size diversity",
    interpretation="neutral",
)
METRIC_REGISTRY.register(
    name="gap_analysis",
    func=compute_gap_analysis,
    description="Area coverage analysis",
    interpretation="higher_better",
)
METRIC_REGISTRY.register(
    name="geometric_quality",
    func=compute_geometric_quality,
    description="Delaunay triangulation quality",
    interpretation="higher_better",
)
# ============================================================================
# AUXILIARY EVALUATOR
# ============================================================================
class AuxiliaryEvaluator:
    """
    Main class for computing auxiliary metrics.

    Metric functions are resolved by name through the global METRIC_REGISTRY,
    so any metric registered there (including dynamically added ones) can be
    enabled via the configuration.
    """

    def __init__(self, config: Optional[AuxiliaryEvalConfig] = None):
        """Store ``config`` (defaults when omitted) and bind the global registry."""
        self.config = config or AuxiliaryEvalConfig()
        self.registry = METRIC_REGISTRY

    def evaluate(
        self,
        centers: np.ndarray,
        radii: np.ndarray,
        primary_score: float
    ) -> Dict[str, Any]:
        """
        Compute all enabled auxiliary metrics.

        Args:
            centers: Circle centers (n, 2)
            radii: Circle radii (n,)
            primary_score: Ground truth primary score (sum of radii)

        Returns:
            Dictionary holding, per metric, ``<name>`` (its value) and
            ``<name>_details``; plus ``auxiliary_text_feedback`` when text
            feedback is enabled. Empty when the evaluation is disabled. A
            metric that raises or returns something other than a MetricResult
            is recorded as 0.0 with an ``error`` detail and is left out of the
            text feedback.
        """
        if not self.config.enabled:
            return {}

        results: Dict[str, Any] = {}
        metric_results: List[MetricResult] = []

        for metric_name in self.config.enabled_metrics:
            metric_func = self.registry.get(metric_name)
            if metric_func is None:
                print(f"[AuxiliaryEvaluator] Warning: Metric '{metric_name}' not found in registry")
                continue

            try:
                metric_result = metric_func(centers, radii)
                # Validate before recording anything: dynamically registered
                # metrics may hand back None or a bare value, which would
                # otherwise crash the feedback generation further down.
                if not isinstance(metric_result, MetricResult):
                    raise TypeError(
                        f"expected MetricResult, got {type(metric_result).__name__}"
                    )
            except Exception as e:
                print(f"[AuxiliaryEvaluator] Error computing '{metric_name}': {e}")
                results[metric_name] = 0.0
                results[f"{metric_name}_details"] = {"error": str(e)}
            else:
                metric_results.append(metric_result)
                results[metric_name] = metric_result.value
                results[f"{metric_name}_details"] = metric_result.details

        if self.config.generate_text_feedback:
            results["auxiliary_text_feedback"] = self._generate_feedback(
                metric_results,
                primary_score,
                centers,
                radii
            )

        return results

    def _generate_feedback(
        self,
        metric_results: List[MetricResult],
        primary_score: float,
        centers: np.ndarray,
        radii: np.ndarray
    ) -> str:
        """
        Generate human-readable feedback based on metric results.

        "higher_better" metrics are grouped as strengths (value >= 0.7) or
        areas for improvement (value < 0.5); everything else is listed under
        other metrics. ``centers`` and ``radii`` are only forwarded to the
        recommendation step.
        """
        rule = "=" * 60
        lines = [
            rule,
            "AUXILIARY EVALUATION FEEDBACK",
            rule,
            f"Primary Score (sum of radii): {primary_score:.4f}",
            "",
            "Auxiliary Metrics:",
            "-" * 60,
        ]

        # Organize metrics by performance
        good_metrics = []
        poor_metrics = []
        neutral_metrics = []
        for result in metric_results:
            if result.interpretation != "higher_better":
                neutral_metrics.append(result)
            elif result.value >= 0.7:
                good_metrics.append(result)
            elif result.value < 0.5:
                poor_metrics.append(result)
            else:
                neutral_metrics.append(result)

        if good_metrics:
            lines.append("\n✅ Strengths:")
            lines.extend(
                f" • {result.description}: {result.value:.3f}"
                for result in good_metrics
            )

        if poor_metrics:
            lines.append("\n⚠️ Areas for Improvement:")
            for result in poor_metrics:
                lines.append(f" • {result.description}: {result.value:.3f}")
                # Each weak metric is followed by a specific suggestion.
                lines.append(f" → {self._get_suggestion(result)}")

        if neutral_metrics:
            lines.append("\n📊 Other Metrics:")
            lines.extend(
                f" • {result.description}: {result.value:.3f}"
                for result in neutral_metrics
            )

        lines.append("\n" + "-" * 60)
        lines.append("💡 Actionable Recommendations:")
        recommendations = self._generate_recommendations(metric_results, centers, radii)
        lines.extend(f" {i}. {rec}" for i, rec in enumerate(recommendations, 1))
        lines.append(rule)

        return "\n".join(lines)

    def _get_suggestion(self, result: MetricResult) -> str:
        """Return the canned suggestion for ``result.name`` (generic fallback)."""
        suggestions = {
            "spatial_uniformity": "Try redistributing circles to reduce clustering",
            "edge_utilization": "Consider placing larger circles near boundaries and corners",
            "density_variance": "Balance circle density across different regions",
            "gap_analysis": "Identify and fill empty regions with additional circles or larger radii",
            "geometric_quality": "Improve triangle quality in Delaunay triangulation"
        }
        return suggestions.get(result.name, "Consider optimizing this aspect")

    def _generate_recommendations(
        self,
        metric_results: List[MetricResult],
        centers: np.ndarray,
        radii: np.ndarray
    ) -> List[str]:
        """
        Generate specific actionable recommendations.

        Only the three lowest-valued "higher_better" metrics are inspected,
        and each contributes a recommendation only when it falls below its
        own threshold. ``centers`` and ``radii`` are currently unused.
        """
        recommendations = []

        # Weakest metrics first.
        prioritized = sorted(
            (r for r in metric_results if r.interpretation == "higher_better"),
            key=lambda r: r.value
        )

        for result in prioritized[:3]:  # Top 3 areas to improve
            if result.name == "spatial_uniformity" and result.value < 0.6:
                recommendations.append(
                    f"Spatial uniformity is low ({result.value:.2f}). "
                    "Check for clustered regions and redistribute circles."
                )
            elif result.name == "edge_utilization" and result.value < 0.5:
                corners = result.details.get("corners_touched", 0)
                if corners < 4:
                    recommendations.append(
                        f"Only {corners}/4 corners are utilized. "
                        "Place larger circles at unused corners."
                    )
            elif result.name == "gap_analysis" and result.value < 0.7:
                gap_ratio = result.details.get("gap_ratio", 0)
                recommendations.append(
                    f"Detected {gap_ratio*100:.1f}% unused space. "
                    "Consider increasing radii in sparse regions."
                )

        if not recommendations:
            recommendations.append("Overall packing quality is good! Continue optimizing primary score.")

        return recommendations
# ============================================================================
# CONVENIENCE FUNCTIONS
# ============================================================================
def evaluate_auxiliary(
    centers: np.ndarray,
    radii: np.ndarray,
    primary_score: float,
    config: Optional[AuxiliaryEvalConfig] = None
) -> Dict[str, Any]:
    """
    Run the auxiliary evaluation in a single call.

    Builds a throw-away AuxiliaryEvaluator for ``config`` and returns what its
    ``evaluate`` method produces.

    Args:
        centers: Circle centers (n, 2)
        radii: Circle radii (n,)
        primary_score: Ground truth primary score
        config: Optional configuration; defaults apply when omitted

    Returns:
        Dictionary with auxiliary metrics
    """
    return AuxiliaryEvaluator(config).evaluate(centers, radii, primary_score)
def save_auxiliary_analysis(
    results: Dict[str, Any],
    output_path: str
):
    """
    Save detailed auxiliary analysis to a JSON file.

    Parent directories are created as needed. NumPy arrays and scalars
    (including ``np.bool_``) are converted to native Python values, also when
    they appear inside tuples or as dictionary keys, because the ``json``
    module cannot serialize them directly.

    Args:
        results: Evaluation results, e.g. as returned by ``evaluate_auxiliary``.
        output_path: Destination file path; an existing file is overwritten.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    def convert_types(obj):
        """Recursively replace NumPy containers/scalars with native types."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # Covers every NumPy scalar kind (integer, floating, bool_, ...).
            return obj.item()
        if isinstance(obj, dict):
            return {convert_types(k): convert_types(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            # json writes tuples as arrays anyway; convert their items too.
            return [convert_types(item) for item in obj]
        return obj

    results_serializable = convert_types(results)

    # json.dump escapes non-ASCII by default; the explicit encoding just keeps
    # the file independent of the platform's locale.
    with open(target, 'w', encoding="utf-8") as f:
        json.dump(results_serializable, f, indent=2)

    print(f"[AuxiliaryEval] Detailed analysis saved to: {target}")
# ============================================================================
# FUTURE EXTENSION INTERFACE
# ============================================================================
class LLMGeneratedMetric:
    """
    Interface for LLM-generated metrics (future feature).

    Wraps a piece of generated source code that is expected to define a
    function named like the metric, with signature ``(centers, radii)``. This
    allows the evaluation agent to propose new metrics dynamically.
    """

    def __init__(
        self,
        name: str,
        code: str,
        description: str,
        interpretation: str = "higher_better"
    ):
        """
        Args:
            name: Metric identifier; also the function name looked up in ``code``.
            code: Python source defining the metric function.
            description: Human-readable description of the metric.
            interpretation: "higher_better", "lower_better", or "neutral".
        """
        self.name = name
        self.code = code
        self.description = description
        self.interpretation = interpretation
        self._compiled_func = None  # Set by a successful compile()

    def compile(self) -> bool:
        """
        Execute the generated code and pick up the metric function.

        Returns:
            True if the code ran and defined a callable named ``self.name``,
            False otherwise (the reason is printed).
        """
        # Names made available to the generated code.
        namespace = {
            'np': np,
            'MetricResult': MetricResult,
        }

        # SECURITY: exec() is NOT a sandbox. The generated code runs with the
        # full privileges of this process (builtins, imports, file and network
        # access). Only compile code from a trusted source, or run it in an
        # isolated process/container.
        try:
            exec(self.code, namespace)
        except Exception as e:
            print(f"[LLMGeneratedMetric] Failed to compile: {e}")
            return False

        # The function is assumed to be named after the metric.
        if self.name not in namespace:
            print(f"[LLMGeneratedMetric] Function '{self.name}' not found in generated code")
            return False

        candidate = namespace[self.name]
        if not callable(candidate):
            # E.g. the code bound the name to a constant instead of a function.
            print(f"[LLMGeneratedMetric] '{self.name}' is defined but is not callable")
            return False

        self._compiled_func = candidate
        return True

    def evaluate(self, centers: np.ndarray, radii: np.ndarray) -> Optional[MetricResult]:
        """
        Execute the compiled metric function.

        Returns:
            The function's MetricResult; a MetricResult wrapping the value if
            the function returned a plain or NumPy number; None if the metric
            is not compiled, raised, or returned an unsupported type.
        """
        if self._compiled_func is None:
            return None

        try:
            result = self._compiled_func(centers, radii)
            if isinstance(result, MetricResult):
                return result
            # Generated code often returns NumPy scalars (np.float32,
            # np.int64, ...) which are not instances of int/float.
            if isinstance(result, (int, float, np.integer, np.floating)):
                return MetricResult(
                    name=self.name,
                    value=float(result),
                    interpretation=self.interpretation,
                    description=self.description
                )
        except Exception as e:
            print(f"[LLMGeneratedMetric] Error executing '{self.name}': {e}")
            return None

        # Never leak an arbitrary object to callers expecting a MetricResult.
        print(
            f"[LLMGeneratedMetric] '{self.name}' returned unsupported type "
            f"{type(result).__name__}"
        )
        return None

    def register_to_global(self) -> bool:
        """
        Register this metric to the global registry, compiling it if needed.

        Returns:
            True once registered, False if compilation failed.
        """
        if self._compiled_func is None:
            if not self.compile():
                return False

        METRIC_REGISTRY.register(
            self.name,
            lambda c, r: self.evaluate(c, r),
            self.description,
            self.interpretation
        )
        return True
# Example of how to use LLMGeneratedMetric (for future):
"""
# LLM generates this code:
llm_metric_code = '''
def corner_circle_size_metric(centers, radii):
# Find circles in corners
corner_circles = []
for i, (center, radius) in enumerate(zip(centers, radii)):
x, y = center
if (x < 0.1 or x > 0.9) and (y < 0.1 or y > 0.9):
corner_circles.append(radius)
if len(corner_circles) == 0:
score = 0.0
else:
score = sum(corner_circles) / len(corner_circles)
return MetricResult(
name="corner_circle_size",
value=score,
interpretation="higher_better",
description="Average size of circles in corners"
)
'''
# Create and register the metric
llm_metric = LLMGeneratedMetric(
name="corner_circle_size_metric",
code=llm_metric_code,
description="LLM-generated: Corner circle size analysis",
interpretation="higher_better"
)
if llm_metric.register_to_global():
print("Successfully registered LLM-generated metric!")
"""